From a5d9265e017f081f0dc133c0e2f45103d027b874 Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Tue, 19 Feb 2019 15:13:32 +0100 Subject: bpf: bpftool, fix documentation for attach types bpftool has support for attach types "stream_verdict" and "stream_parser" but the documentation was referring to them as "skb_verdict" and "skb_parse". The inconsistency comes from commit b7d3826c2ed6 ("bpf: bpftool, add support for attaching programs to maps"). This patch changes the documentation to match the implementation: - "bpftool prog help" - man pages - bash completion Signed-off-by: Alban Crequy Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 2 +- tools/bpf/bpftool/bash-completion/bpftool | 4 ++-- tools/bpf/bpftool/prog.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 7e59495cb028..12bc1e2d4b46 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -42,7 +42,7 @@ PROG COMMANDS | **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | } | *ATTACH_TYPE* := { -| **msg_verdict** | **skb_verdict** | **skb_parse** | **flow_dissector** +| **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** | } diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 763dd12482aa..b803827d01e8 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -311,8 +311,8 @@ _bpftool() return 0 ;; 5) - COMPREPLY=( $( compgen -W 'msg_verdict skb_verdict \ - skb_parse flow_dissector' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'msg_verdict stream_verdict \ + stream_parser flow_dissector' -- "$cur" ) ) return 0 ;; 6) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 33ed0806ccc0..db978c8d76a8 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1199,7 +1199,7 @@ static int do_help(int argc, char **argv) " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n" " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" " cgroup/sendmsg4 | cgroup/sendmsg6 }\n" - " ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse |\n" + " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" " flow_dissector }\n" " " HELP_SPEC_OPTIONS "\n" "", -- cgit v1.2.3-59-g8ed1b From 568f196756ad9fe2b49c46bbf6a9de1b190438b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Jan 2019 17:21:52 -0800 Subject: bpf: check that BPF programs run with preemption disabled Introduce cant_sleep() macro for annotation of functions that cannot sleep. Use it in BPF_PROG_RUN to catch execution of BPF programs in preemptable context. 
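As a rough illustration (not part of this patch), the calling convention the new annotation enforces is sketched below. The helper name is hypothetical, but the preempt_disable()/preempt_enable() pairing around BPF_PROG_RUN() is the pattern that later patches in this series apply to callers such as kernel/seccomp.c and lib/test_bpf.c; <linux/filter.h> and <linux/preempt.h> are assumed to be included.

    /* Hypothetical caller: with CONFIG_DEBUG_ATOMIC_SLEEP enabled, the
     * cant_sleep() check inside BPF_PROG_RUN() would print a stack trace
     * if this ran with preemption enabled.
     */
    static u32 run_prog_nonpreempt(const struct bpf_prog *prog, const void *ctx)
    {
            u32 ret;

            preempt_disable();             /* enter non-preemptible section */
            ret = BPF_PROG_RUN(prog, ctx); /* cant_sleep() is satisfied here */
            preempt_enable();

            return ret;
    }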
Suggested-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 2 +- include/linux/kernel.h | 14 ++++++++++++-- kernel/sched/core.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 95e2d7ebdf21..f32b3eca5a04 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -533,7 +533,7 @@ struct sk_filter { struct bpf_prog *prog; }; -#define BPF_PROG_RUN(filter, ctx) (*(filter)->bpf_func)(ctx, (filter)->insnsi) +#define BPF_PROG_RUN(filter, ctx) ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); }) #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 8f0e68e250a7..a8868a32098c 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -245,8 +245,10 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP - void ___might_sleep(const char *file, int line, int preempt_offset); - void __might_sleep(const char *file, int line, int preempt_offset); +extern void ___might_sleep(const char *file, int line, int preempt_offset); +extern void __might_sleep(const char *file, int line, int preempt_offset); +extern void __cant_sleep(const char *file, int line, int preempt_offset); + /** * might_sleep - annotation for functions that can sleep * @@ -259,6 +261,13 @@ extern int _cond_resched(void); */ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) +/** + * cant_sleep - annotation for functions that cannot sleep + * + * this macro will print a stack trace if it is executed with preemption enabled + */ +# define cant_sleep() \ + do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) # define sched_annotate_sleep() (current->task_state_change = 0) #else static inline void ___might_sleep(const char *file, int line, @@ -266,6 +275,7 @@ extern int _cond_resched(void); static inline void __might_sleep(const char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) +# define cant_sleep() do { } while (0) # define sched_annotate_sleep() do { } while (0) #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8d76a65cfdd..7cbb5658be80 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6162,6 +6162,34 @@ void ___might_sleep(const char *file, int line, int preempt_offset) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL(___might_sleep); + +void __cant_sleep(const char *file, int line, int preempt_offset) +{ + static unsigned long prev_jiffy; + + if (irqs_disabled()) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + if (preempt_count() > preempt_offset) + return; + + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); + printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} +EXPORT_SYMBOL_GPL(__cant_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ -- cgit v1.2.3-59-g8ed1b From 74e31ca850c1cddeca03503171dd145b6ce293b6 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 19 Feb 2019 19:53:02 +0100 Subject: bpf: add skb->queue_mapping write access from tc clsact The 
skb->queue_mapping field already has read access, via __sk_buff->queue_mapping. This patch allows BPF tc qdisc clsact programs write access to queue_mapping via tc_cls_act_is_valid_access. It also handles the reserved value NO_QUEUE_MAPPING, which must not be written. It is already possible to change this via the TC filter action skbedit, see tc-skbedit(8). Due to the lack of TC examples, let's show one: # tc qdisc add dev ixgbe1 clsact # tc filter add dev ixgbe1 ingress matchall action skbedit queue_mapping 5 # tc filter list dev ixgbe1 ingress The most common mistake is that XPS (Transmit Packet Steering) takes precedence over setting skb->queue_mapping. XPS is configured per DEVICE via /sys/class/net/DEVICE/queues/tx-*/xps_cpus with a CPU hex mask. To disable it, set the mask to 00. The purpose of changing skb->queue_mapping is to influence the selection of the net_device "txq" (struct netdev_queue), which in turn influences the selection of the qdisc "root_lock" (via txq->qdisc->q.lock) and txq->_xmit_lock. When using the MQ qdisc, txq->qdisc points to different qdiscs and associated locks, and HARD_TX_LOCK (txq->_xmit_lock), allowing for CPU scalability. Due to the lack of TC examples, let's show how to attach clsact BPF programs: # tc qdisc add dev ixgbe2 clsact # tc filter add dev ixgbe2 egress bpf da obj XXX_kern.o sec tc_qmap2cpu # tc filter list dev ixgbe2 egress Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- net/core/filter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index b584cb42a803..85749f6ec789 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6279,6 +6279,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): case bpf_ctx_range(struct __sk_buff, tstamp): + case bpf_ctx_range(struct __sk_buff, queue_mapping): break; default: return false; @@ -6683,9 +6684,18 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, queue_mapping): - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, queue_mapping, 2, target_size)); + if (type == BPF_WRITE) { + *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + queue_mapping, + 2, target_size)); + } else { + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + queue_mapping, + 2, target_size)); + } break; case offsetof(struct __sk_buff, vlan_present): -- cgit v1.2.3-59-g8ed1b From e80d02dd763093f70c3000ef34253a6d426becf6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 21 Feb 2019 10:40:14 -0800 Subject: seccomp, bpf: disable preemption before calling into bpf prog All BPF programs must be called with preemption disabled. Fixes: 568f196756ad ("bpf: check that BPF programs run with preemption disabled") Reported-by: syzbot+8bf19ee2aa580de7a2a7@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/seccomp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e815781ed751..a43c601ac252 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -267,6 +267,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA).
*/ + preempt_disable(); for (; f; f = f->prev) { u32 cur_ret = BPF_PROG_RUN(f->prog, sd); @@ -275,6 +276,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, *match = f; } } + preempt_enable(); return ret; } #endif /* CONFIG_SECCOMP_FILTER */ -- cgit v1.2.3-59-g8ed1b From 915654fd718c2366871b19f8c6687e61909db911 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 21 Feb 2019 17:05:39 +0100 Subject: samples/bpf: Fix dummy program unloading for xdp_redirect samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xdp_redirect and xdp_redirect_map sample programs both load a dummy program onto the egress interfaces. However, the unload code checks these programs against the wrong id number, and thus refuses to unload them. Fix the comparison to avoid this. Fixes: 3b7a8ec2dec3 ("samples/bpf: Check the prog id before exiting") Signed-off-by: Toke Høiland-Jørgensen Acked-by: Maciej Fijalkowski Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- samples/bpf/xdp_redirect_map_user.c | 2 +- samples/bpf/xdp_redirect_user.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index 327226be5a06..1dbe7fd3a1a8 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -57,7 +57,7 @@ static void int_exit(int sig) printf("bpf_get_link_xdp_id failed\n"); exit(1); } - if (prog_id == curr_prog_id) + if (dummy_prog_id == curr_prog_id) bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); else if (!curr_prog_id) printf("couldn't find a prog id on iface OUT\n"); diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index a5d8ad3129ed..e9054c0269ff 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -57,7 +57,7 @@ static void int_exit(int sig) printf("bpf_get_link_xdp_id failed\n"); exit(1); } - if (prog_id == curr_prog_id) + if (dummy_prog_id == curr_prog_id) bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); else if (!curr_prog_id) printf("couldn't find a prog id on iface OUT\n"); -- cgit v1.2.3-59-g8ed1b From fd92d6648feb83888e40f0aa2f74e1d9bbd2ad7d Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 22 Feb 2019 09:46:52 +0100 Subject: bpf: test_bpf: turn off preemption in function __run_once When running BPF test suite the following splat occurs: [ 415.930950] test_bpf: #0 TAX jited:0 [ 415.931067] BUG: assuming atomic context at lib/test_bpf.c:6674 [ 415.946169] in_atomic(): 0, irqs_disabled(): 0, pid: 11556, name: modprobe [ 415.953176] INFO: lockdep is turned off. [ 415.957207] CPU: 1 PID: 11556 Comm: modprobe Tainted: G W 5.0.0-rc7-next-20190220 #1 [ 415.966328] Hardware name: HiKey Development Board (DT) [ 415.971592] Call trace: [ 415.974069] dump_backtrace+0x0/0x160 [ 415.977761] show_stack+0x24/0x30 [ 415.981104] dump_stack+0xc8/0x114 [ 415.984534] __cant_sleep+0xf0/0x108 [ 415.988145] test_bpf_init+0x5e0/0x1000 [test_bpf] [ 415.992971] do_one_initcall+0x90/0x428 [ 415.996837] do_init_module+0x60/0x1e4 [ 416.000614] load_module+0x1de0/0x1f50 [ 416.004391] __se_sys_finit_module+0xc8/0xe0 [ 416.008691] __arm64_sys_finit_module+0x24/0x30 [ 416.013255] el0_svc_common+0x78/0x130 [ 416.017031] el0_svc_handler+0x38/0x78 [ 416.020806] el0_svc+0x8/0xc Rework so that preemption is disabled when we loop over function 'BPF_PROG_RUN(...)'. 
Fixes: 568f196756ad ("bpf: check that BPF programs run with preemption disabled") Suggested-by: Arnd Bergmann Signed-off-by: Anders Roxell Signed-off-by: Daniel Borkmann --- lib/test_bpf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index f3e570722a7e..0845f635f404 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6668,12 +6668,14 @@ static int __run_one(const struct bpf_prog *fp, const void *data, u64 start, finish; int ret = 0, i; + preempt_disable(); start = ktime_get_ns(); for (i = 0; i < runs; i++) ret = BPF_PROG_RUN(fp, data); finish = ktime_get_ns(); + preempt_enable(); *duration = finish - start; do_div(*duration, runs); -- cgit v1.2.3-59-g8ed1b From a439184d515fbf4805f57d11fa5dfd4524d2c0eb Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 19 Feb 2019 10:54:17 -0800 Subject: bpf/test_run: fix unkillable BPF_PROG_TEST_RUN for flow dissector Syzbot found out that running BPF_PROG_TEST_RUN with repeat=0xffffffff makes process unkillable. The problem is that when CONFIG_PREEMPT is enabled, we never see need_resched() return true. This is due to the fact that preempt_enable() (which we do in bpf_test_run_one on each iteration) now handles resched if it's needed. Let's disable preemption for the whole run, not per test. In this case we can properly see whether resched is needed. Let's also properly return -EINTR to the userspace in case of a signal interrupt. This is a follow up for a recently fixed issue in bpf_test_run, see commit df1a2cb7c74b ("bpf/test_run: fix unkillable BPF_PROG_TEST_RUN"). Reported-by: syzbot Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- net/bpf/test_run.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2c5172b33209..619655db8d9e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -293,31 +293,45 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (!repeat) repeat = 1; + rcu_read_lock(); + preempt_disable(); time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - preempt_disable(); - rcu_read_lock(); retval = __skb_flow_bpf_dissect(prog, skb, &flow_keys_dissector, &flow_keys); - rcu_read_unlock(); - preempt_enable(); + + if (signal_pending(current)) { + preempt_enable(); + rcu_read_unlock(); + + ret = -EINTR; + goto out; + } if (need_resched()) { - if (signal_pending(current)) - break; time_spent += ktime_get_ns() - time_start; + preempt_enable(); + rcu_read_unlock(); + cond_resched(); + + rcu_read_lock(); + preempt_disable(); time_start = ktime_get_ns(); } } time_spent += ktime_get_ns() - time_start; + preempt_enable(); + rcu_read_unlock(); + do_div(time_spent, repeat); duration = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), retval, duration); +out: kfree_skb(skb); kfree(sk); return ret; -- cgit v1.2.3-59-g8ed1b From 740f8a6572213a677c7916b12a11d831373722ce Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 21 Feb 2019 11:11:00 -0800 Subject: selftests/bpf: make sure signal interrupts BPF_PROG_TEST_RUN Simple test that I used to reproduce the issue in the previous commit: Do BPF_PROG_TEST_RUN with max iterations, each program is 4096 simple move instructions. File alarm in 0.1 second and check that bpf_prog_test_run is interrupted (i.e. test doesn't hang). Note: reposting this for bpf-next to avoid linux-next conflict. 
In this version I test both BPF_PROG_TYPE_SOCKET_FILTER (which uses generic bpf_test_run implementation) and BPF_PROG_TYPE_FLOW_DISSECTOR (which has it own loop with preempt handling in bpf_prog_test_run_flow_dissector). Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_progs.c | 44 ++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index c52bd90fbb34..c59d2e015d16 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -12,6 +12,7 @@ #include #include #include +#include #include typedef __u16 __sum16; @@ -28,6 +29,7 @@ typedef __u16 __sum16; #include #include #include +#include #include #include #include @@ -2108,6 +2110,46 @@ close_prog_noerr: bpf_object__close(obj); } +static void sigalrm_handler(int s) {} +static struct sigaction sigalrm_action = { + .sa_handler = sigalrm_handler, +}; + +static void test_signal_pending(enum bpf_prog_type prog_type) +{ + struct bpf_insn prog[4096]; + struct itimerval timeo = { + .it_value.tv_usec = 100000, /* 100ms */ + }; + __u32 duration, retval; + int prog_fd; + int err; + int i; + + for (i = 0; i < ARRAY_SIZE(prog); i++) + prog[i] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0); + prog[ARRAY_SIZE(prog) - 1] = BPF_EXIT_INSN(); + + prog_fd = bpf_load_program(prog_type, prog, ARRAY_SIZE(prog), + "GPL", 0, NULL, 0); + CHECK(prog_fd < 0, "test-run", "errno %d\n", errno); + + err = sigaction(SIGALRM, &sigalrm_action, NULL); + CHECK(err, "test-run-signal-sigaction", "errno %d\n", errno); + + err = setitimer(ITIMER_REAL, &timeo, NULL); + CHECK(err, "test-run-signal-timer", "errno %d\n", errno); + + err = bpf_prog_test_run(prog_fd, 0xffffffff, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, &retval, &duration); + CHECK(duration > 500000000, /* 500ms */ + "test-run-signal-duration", + "duration %dns > 500ms\n", + duration); + + signal(SIGALRM, SIG_DFL); +} + int main(void) { srand(time(NULL)); @@ -2138,6 +2180,8 @@ int main(void) test_flow_dissector(); test_spinlock(); test_map_lock(); + test_signal_pending(BPF_PROG_TYPE_SOCKET_FILTER); + test_signal_pending(BPF_PROG_TYPE_FLOW_DISSECTOR); printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS; -- cgit v1.2.3-59-g8ed1b From 1cad078842396f0047a796694b6130fc096d97e2 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 21 Feb 2019 10:21:26 +0100 Subject: libbpf: add support for using AF_XDP sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds AF_XDP support to libbpf. The main reason for this is to facilitate writing applications that use AF_XDP by offering higher-level APIs that hide many of the details of the AF_XDP uapi. This is in the same vein as libbpf facilitates XDP adoption by offering easy-to-use higher level interfaces of XDP functionality. Hopefully this will facilitate adoption of AF_XDP, make applications using it simpler and smaller, and finally also make it possible for applications to benefit from optimizations in the AF_XDP user space access code. Previously, people just copied and pasted the code from the sample application into their application, which is not desirable. 
The interface is composed of two parts: * Low-level access interface to the four rings and the packet * High-level control plane interface for creating and setting up umems and af_xdp sockets as well as a simple XDP program. Tested-by: Björn Töpel Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/ethtool.h | 51 +++ tools/include/uapi/linux/if_xdp.h | 78 ++++ tools/lib/bpf/Build | 2 +- tools/lib/bpf/Makefile | 5 +- tools/lib/bpf/README.rst | 15 +- tools/lib/bpf/libbpf.map | 6 + tools/lib/bpf/xsk.c | 723 +++++++++++++++++++++++++++++++++++++ tools/lib/bpf/xsk.h | 203 +++++++++++ 8 files changed, 1080 insertions(+), 3 deletions(-) create mode 100644 tools/include/uapi/linux/ethtool.h create mode 100644 tools/include/uapi/linux/if_xdp.h create mode 100644 tools/lib/bpf/xsk.c create mode 100644 tools/lib/bpf/xsk.h diff --git a/tools/include/uapi/linux/ethtool.h b/tools/include/uapi/linux/ethtool.h new file mode 100644 index 000000000000..c86c3e942df9 --- /dev/null +++ b/tools/include/uapi/linux/ethtool.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * ethtool.h: Defines for Linux ethtool. + * + * Copyright (C) 1998 David S. Miller (davem@redhat.com) + * Copyright 2001 Jeff Garzik + * Portions Copyright 2001 Sun Microsystems (thockin@sun.com) + * Portions Copyright 2002 Intel (eli.kupermann@intel.com, + * christopher.leech@intel.com, + * scott.feldman@intel.com) + * Portions Copyright (C) Sun Microsystems 2008 + */ + +#ifndef _UAPI_LINUX_ETHTOOL_H +#define _UAPI_LINUX_ETHTOOL_H + +#include +#include +#include + +#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ + +/** + * struct ethtool_channels - configuring number of network channel + * @cmd: ETHTOOL_{G,S}CHANNELS + * @max_rx: Read only. Maximum number of receive channel the driver support. + * @max_tx: Read only. Maximum number of transmit channel the driver support. + * @max_other: Read only. Maximum number of other channel the driver support. + * @max_combined: Read only. Maximum number of combined channel the driver + * support. Set of queues RX, TX or other. + * @rx_count: Valid values are in the range 1 to the max_rx. + * @tx_count: Valid values are in the range 1 to the max_tx. + * @other_count: Valid values are in the range 1 to the max_other. + * @combined_count: Valid values are in the range 1 to the max_combined. + * + * This can be used to configure RX, TX and other channels. + */ + +struct ethtool_channels { + __u32 cmd; + __u32 max_rx; + __u32 max_tx; + __u32 max_other; + __u32 max_combined; + __u32 rx_count; + __u32 tx_count; + __u32 other_count; + __u32 combined_count; +}; + +#endif /* _UAPI_LINUX_ETHTOOL_H */ diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h new file mode 100644 index 000000000000..caed8b1614ff --- /dev/null +++ b/tools/include/uapi/linux/if_xdp.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * if_xdp: XDP socket user-space interface + * Copyright(c) 2018 Intel Corporation. 
+ * + * Author(s): Björn Töpel + * Magnus Karlsson + */ + +#ifndef _LINUX_IF_XDP_H +#define _LINUX_IF_XDP_H + +#include + +/* Options for the sxdp_flags field */ +#define XDP_SHARED_UMEM (1 << 0) +#define XDP_COPY (1 << 1) /* Force copy-mode */ +#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ + +struct sockaddr_xdp { + __u16 sxdp_family; + __u16 sxdp_flags; + __u32 sxdp_ifindex; + __u32 sxdp_queue_id; + __u32 sxdp_shared_umem_fd; +}; + +struct xdp_ring_offset { + __u64 producer; + __u64 consumer; + __u64 desc; +}; + +struct xdp_mmap_offsets { + struct xdp_ring_offset rx; + struct xdp_ring_offset tx; + struct xdp_ring_offset fr; /* Fill */ + struct xdp_ring_offset cr; /* Completion */ +}; + +/* XDP socket options */ +#define XDP_MMAP_OFFSETS 1 +#define XDP_RX_RING 2 +#define XDP_TX_RING 3 +#define XDP_UMEM_REG 4 +#define XDP_UMEM_FILL_RING 5 +#define XDP_UMEM_COMPLETION_RING 6 +#define XDP_STATISTICS 7 + +struct xdp_umem_reg { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 chunk_size; + __u32 headroom; +}; + +struct xdp_statistics { + __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ +}; + +/* Pgoff for mmaping the rings */ +#define XDP_PGOFF_RX_RING 0 +#define XDP_PGOFF_TX_RING 0x80000000 +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL + +/* Rx/Tx descriptor */ +struct xdp_desc { + __u64 addr; + __u32 len; + __u32 options; +}; + +/* UMEM descriptor is __u64 */ + +#endif /* _LINUX_IF_XDP_H */ diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index bfd9bfc82c3b..ee9d5362f35b 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1 +1 @@ -libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o bpf_prog_linfo.o libbpf_probes.o +libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 847916273696..761691bd72ad 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -164,6 +164,9 @@ $(BPF_IN): force elfdep bpfdep @(test -f ../../include/uapi/linux/if_link.h -a -f ../../../include/uapi/linux/if_link.h && ( \ (diff -B ../../include/uapi/linux/if_link.h ../../../include/uapi/linux/if_link.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_link.h' differs from latest version at 'include/uapi/linux/if_link.h'" >&2 )) || true + @(test -f ../../include/uapi/linux/if_xdp.h -a -f ../../../include/uapi/linux/if_xdp.h && ( \ + (diff -B ../../include/uapi/linux/if_xdp.h ../../../include/uapi/linux/if_xdp.h >/dev/null) || \ + echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true $(Q)$(MAKE) $(build)=libbpf $(OUTPUT)libbpf.so: $(BPF_IN) @@ -174,7 +177,7 @@ $(OUTPUT)libbpf.a: $(BPF_IN) $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^ $(OUTPUT)test_libbpf: test_libbpf.cpp $(OUTPUT)libbpf.a - $(QUIET_LINK)$(CXX) $^ -lelf -o $@ + $(QUIET_LINK)$(CXX) $(INCLUDES) $^ -lelf -o $@ check: check_abi diff --git a/tools/lib/bpf/README.rst b/tools/lib/bpf/README.rst index 607aae40f4ed..5788479384ca 100644 --- a/tools/lib/bpf/README.rst +++ b/tools/lib/bpf/README.rst @@ -9,7 +9,7 @@ described here. 
It's recommended to follow these conventions whenever a new function or type is added to keep libbpf API clean and consistent. All types and functions provided by libbpf API should have one of the -following prefixes: ``bpf_``, ``btf_``, ``libbpf_``. +following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``. System call wrappers -------------------- @@ -62,6 +62,19 @@ Auxiliary functions and types that don't fit well in any of categories described above should have ``libbpf_`` prefix, e.g. ``libbpf_get_error`` or ``libbpf_prog_type_by_name``. +AF_XDP functions +------------------- + +AF_XDP functions should have an ``xsk_`` prefix, e.g. +``xsk_umem__get_data`` or ``xsk_umem__create``. The interface consists +of both low-level ring access functions and high-level configuration +functions. These can be mixed and matched. Note that these functions +are not reentrant for performance reasons. + +Please take a look at Documentation/networking/af_xdp.rst in the Linux +kernel source tree on how to use XDP sockets and for some common +mistakes in case you do not get any traffic up to user space. + libbpf ABI ========== diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 99dfa710c818..778a26702a70 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -147,4 +147,10 @@ LIBBPF_0.0.2 { btf_ext__new; btf_ext__reloc_func_info; btf_ext__reloc_line_info; + xsk_umem__create; + xsk_socket__create; + xsk_umem__delete; + xsk_socket__delete; + xsk_umem__fd; + xsk_socket__fd; } LIBBPF_0.0.1; diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c new file mode 100644 index 000000000000..f98ac82c9aea --- /dev/null +++ b/tools/lib/bpf/xsk.c @@ -0,0 +1,723 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * AF_XDP user-space access library. + * + * Copyright(c) 2018 - 2019 Intel Corporation. + * + * Author(s): Magnus Karlsson + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_util.h" +#include "xsk.h" + +#ifndef SOL_XDP + #define SOL_XDP 283 +#endif + +#ifndef AF_XDP + #define AF_XDP 44 +#endif + +#ifndef PF_XDP + #define PF_XDP AF_XDP +#endif + +struct xsk_umem { + struct xsk_ring_prod *fill; + struct xsk_ring_cons *comp; + char *umem_area; + struct xsk_umem_config config; + int fd; + int refcount; +}; + +struct xsk_socket { + struct xsk_ring_cons *rx; + struct xsk_ring_prod *tx; + __u64 outstanding_tx; + struct xsk_umem *umem; + struct xsk_socket_config config; + int fd; + int xsks_map; + int ifindex; + int prog_fd; + int qidconf_map_fd; + int xsks_map_fd; + __u32 queue_id; + char ifname[IFNAMSIZ]; +}; + +struct xsk_nl_info { + bool xdp_prog_attached; + int ifindex; + int fd; +}; + +/* For 32-bit systems, we need to use mmap2 as the offsets are 64-bit. + * Unfortunately, it is not part of glibc. + */ +static inline void *xsk_mmap(void *addr, size_t length, int prot, int flags, + int fd, __u64 offset) +{ +#ifdef __NR_mmap2 + unsigned int page_shift = __builtin_ffs(getpagesize()) - 1; + long ret = syscall(__NR_mmap2, addr, length, prot, flags, fd, + (off_t)(offset >> page_shift)); + + return (void *)ret; +#else + return mmap(addr, length, prot, flags, fd, offset); +#endif +} + +int xsk_umem__fd(const struct xsk_umem *umem) +{ + return umem ? umem->fd : -EINVAL; +} + +int xsk_socket__fd(const struct xsk_socket *xsk) +{ + return xsk ? 
xsk->fd : -EINVAL; +} + +static bool xsk_page_aligned(void *buffer) +{ + unsigned long addr = (unsigned long)buffer; + + return !(addr & (getpagesize() - 1)); +} + +static void xsk_set_umem_config(struct xsk_umem_config *cfg, + const struct xsk_umem_config *usr_cfg) +{ + if (!usr_cfg) { + cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; + cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; + return; + } + + cfg->fill_size = usr_cfg->fill_size; + cfg->comp_size = usr_cfg->comp_size; + cfg->frame_size = usr_cfg->frame_size; + cfg->frame_headroom = usr_cfg->frame_headroom; +} + +static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg, + const struct xsk_socket_config *usr_cfg) +{ + if (!usr_cfg) { + cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg->libbpf_flags = 0; + cfg->xdp_flags = 0; + cfg->bind_flags = 0; + return; + } + + cfg->rx_size = usr_cfg->rx_size; + cfg->tx_size = usr_cfg->tx_size; + cfg->libbpf_flags = usr_cfg->libbpf_flags; + cfg->xdp_flags = usr_cfg->xdp_flags; + cfg->bind_flags = usr_cfg->bind_flags; +} + +int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size, + struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, + const struct xsk_umem_config *usr_config) +{ + struct xdp_mmap_offsets off; + struct xdp_umem_reg mr; + struct xsk_umem *umem; + socklen_t optlen; + void *map; + int err; + + if (!umem_area || !umem_ptr || !fill || !comp) + return -EFAULT; + if (!size && !xsk_page_aligned(umem_area)) + return -EINVAL; + + umem = calloc(1, sizeof(*umem)); + if (!umem) + return -ENOMEM; + + umem->fd = socket(AF_XDP, SOCK_RAW, 0); + if (umem->fd < 0) { + err = -errno; + goto out_umem_alloc; + } + + umem->umem_area = umem_area; + xsk_set_umem_config(&umem->config, usr_config); + + mr.addr = (uintptr_t)umem_area; + mr.len = size; + mr.chunk_size = umem->config.frame_size; + mr.headroom = umem->config.frame_headroom; + + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)); + if (err) { + err = -errno; + goto out_socket; + } + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING, + &umem->config.fill_size, + sizeof(umem->config.fill_size)); + if (err) { + err = -errno; + goto out_socket; + } + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, + &umem->config.comp_size, + sizeof(umem->config.comp_size)); + if (err) { + err = -errno; + goto out_socket; + } + + optlen = sizeof(off); + err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen); + if (err) { + err = -errno; + goto out_socket; + } + + map = xsk_mmap(NULL, off.fr.desc + + umem->config.fill_size * sizeof(__u64), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, + umem->fd, XDP_UMEM_PGOFF_FILL_RING); + if (map == MAP_FAILED) { + err = -errno; + goto out_socket; + } + + umem->fill = fill; + fill->mask = umem->config.fill_size - 1; + fill->size = umem->config.fill_size; + fill->producer = map + off.fr.producer; + fill->consumer = map + off.fr.consumer; + fill->ring = map + off.fr.desc; + fill->cached_cons = umem->config.fill_size; + + map = xsk_mmap(NULL, + off.cr.desc + umem->config.comp_size * sizeof(__u64), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, + umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING); + if (map == MAP_FAILED) { + err = -errno; + goto out_mmap; + } + + umem->comp = comp; + comp->mask = umem->config.comp_size - 1; + comp->size = umem->config.comp_size; + comp->producer = 
map + off.cr.producer; + comp->consumer = map + off.cr.consumer; + comp->ring = map + off.cr.desc; + + *umem_ptr = umem; + return 0; + +out_mmap: + munmap(umem->fill, + off.fr.desc + umem->config.fill_size * sizeof(__u64)); +out_socket: + close(umem->fd); +out_umem_alloc: + free(umem); + return err; +} + +static int xsk_load_xdp_prog(struct xsk_socket *xsk) +{ + char bpf_log_buf[BPF_LOG_BUF_SIZE]; + int err, prog_fd; + + /* This is the C-program: + * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) + * { + * int *qidconf, index = ctx->rx_queue_index; + * + * // A set entry here means that the correspnding queue_id + * // has an active AF_XDP socket bound to it. + * qidconf = bpf_map_lookup_elem(&qidconf_map, &index); + * if (!qidconf) + * return XDP_ABORTED; + * + * if (*qidconf) + * return bpf_redirect_map(&xsks_map, index, 0); + * + * return XDP_PASS; + * } + */ + struct bpf_insn prog[] = { + /* r1 = *(u32 *)(r1 + 16) */ + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 16), + /* *(u32 *)(r10 - 4) = r1 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_1, -4), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, xsk->qidconf_map_fd), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV32_IMM(BPF_REG_0, 0), + /* if r1 == 0 goto +8 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 8), + BPF_MOV32_IMM(BPF_REG_0, 2), + /* r1 = *(u32 *)(r1 + 0) */ + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), + /* if r1 == 0 goto +5 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), + /* r2 = *(u32 *)(r10 - 4) */ + BPF_LD_MAP_FD(BPF_REG_1, xsk->xsks_map_fd), + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4), + BPF_MOV32_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_redirect_map), + /* The jumps are to this instruction */ + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + + prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, prog, insns_cnt, + "LGPL-2.1 or BSD-2-Clause", 0, bpf_log_buf, + BPF_LOG_BUF_SIZE); + if (prog_fd < 0) { + pr_warning("BPF log buffer:\n%s", bpf_log_buf); + return prog_fd; + } + + err = bpf_set_link_xdp_fd(xsk->ifindex, prog_fd, xsk->config.xdp_flags); + if (err) { + close(prog_fd); + return err; + } + + xsk->prog_fd = prog_fd; + return 0; +} + +static int xsk_get_max_queues(struct xsk_socket *xsk) +{ + struct ethtool_channels channels; + struct ifreq ifr; + int fd, err, ret; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return -errno; + + channels.cmd = ETHTOOL_GCHANNELS; + ifr.ifr_data = (void *)&channels; + strncpy(ifr.ifr_name, xsk->ifname, IFNAMSIZ); + err = ioctl(fd, SIOCETHTOOL, &ifr); + if (err && errno != EOPNOTSUPP) { + ret = -errno; + goto out; + } + + if (channels.max_combined == 0 || errno == EOPNOTSUPP) + /* If the device says it has no channels, then all traffic + * is sent to a single stream, so max queues = 1. 
+ */ + ret = 1; + else + ret = channels.max_combined; + +out: + close(fd); + return ret; +} + +static int xsk_create_bpf_maps(struct xsk_socket *xsk) +{ + int max_queues; + int fd; + + max_queues = xsk_get_max_queues(xsk); + if (max_queues < 0) + return max_queues; + + fd = bpf_create_map_name(BPF_MAP_TYPE_ARRAY, "qidconf_map", + sizeof(int), sizeof(int), max_queues, 0); + if (fd < 0) + return fd; + xsk->qidconf_map_fd = fd; + + fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map", + sizeof(int), sizeof(int), max_queues, 0); + if (fd < 0) { + close(xsk->qidconf_map_fd); + return fd; + } + xsk->xsks_map_fd = fd; + + return 0; +} + +static void xsk_delete_bpf_maps(struct xsk_socket *xsk) +{ + close(xsk->qidconf_map_fd); + close(xsk->xsks_map_fd); +} + +static int xsk_update_bpf_maps(struct xsk_socket *xsk, int qidconf_value, + int xsks_value) +{ + bool qidconf_map_updated = false, xsks_map_updated = false; + struct bpf_prog_info prog_info = {}; + __u32 prog_len = sizeof(prog_info); + struct bpf_map_info map_info; + __u32 map_len = sizeof(map_info); + __u32 *map_ids; + int reset_value = 0; + __u32 num_maps; + unsigned int i; + int err; + + err = bpf_obj_get_info_by_fd(xsk->prog_fd, &prog_info, &prog_len); + if (err) + return err; + + num_maps = prog_info.nr_map_ids; + + map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids)); + if (!map_ids) + return -ENOMEM; + + memset(&prog_info, 0, prog_len); + prog_info.nr_map_ids = num_maps; + prog_info.map_ids = (__u64)(unsigned long)map_ids; + + err = bpf_obj_get_info_by_fd(xsk->prog_fd, &prog_info, &prog_len); + if (err) + goto out_map_ids; + + for (i = 0; i < prog_info.nr_map_ids; i++) { + int fd; + + fd = bpf_map_get_fd_by_id(map_ids[i]); + if (fd < 0) { + err = -errno; + goto out_maps; + } + + err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len); + if (err) + goto out_maps; + + if (!strcmp(map_info.name, "qidconf_map")) { + err = bpf_map_update_elem(fd, &xsk->queue_id, + &qidconf_value, 0); + if (err) + goto out_maps; + qidconf_map_updated = true; + xsk->qidconf_map_fd = fd; + } else if (!strcmp(map_info.name, "xsks_map")) { + err = bpf_map_update_elem(fd, &xsk->queue_id, + &xsks_value, 0); + if (err) + goto out_maps; + xsks_map_updated = true; + xsk->xsks_map_fd = fd; + } + + if (qidconf_map_updated && xsks_map_updated) + break; + } + + if (!(qidconf_map_updated && xsks_map_updated)) { + err = -ENOENT; + goto out_maps; + } + + err = 0; + goto out_success; + +out_maps: + if (qidconf_map_updated) + (void)bpf_map_update_elem(xsk->qidconf_map_fd, &xsk->queue_id, + &reset_value, 0); + if (xsks_map_updated) + (void)bpf_map_update_elem(xsk->xsks_map_fd, &xsk->queue_id, + &reset_value, 0); +out_success: + if (qidconf_map_updated) + close(xsk->qidconf_map_fd); + if (xsks_map_updated) + close(xsk->xsks_map_fd); +out_map_ids: + free(map_ids); + return err; +} + +static int xsk_setup_xdp_prog(struct xsk_socket *xsk) +{ + bool prog_attached = false; + __u32 prog_id = 0; + int err; + + err = bpf_get_link_xdp_id(xsk->ifindex, &prog_id, + xsk->config.xdp_flags); + if (err) + return err; + + if (!prog_id) { + prog_attached = true; + err = xsk_create_bpf_maps(xsk); + if (err) + return err; + + err = xsk_load_xdp_prog(xsk); + if (err) + goto out_maps; + } else { + xsk->prog_fd = bpf_prog_get_fd_by_id(prog_id); + } + + err = xsk_update_bpf_maps(xsk, true, xsk->fd); + if (err) + goto out_load; + + return 0; + +out_load: + if (prog_attached) + close(xsk->prog_fd); +out_maps: + if (prog_attached) + xsk_delete_bpf_maps(xsk); + return err; +} + +int 
xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, + __u32 queue_id, struct xsk_umem *umem, + struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, + const struct xsk_socket_config *usr_config) +{ + struct sockaddr_xdp sxdp = {}; + struct xdp_mmap_offsets off; + struct xsk_socket *xsk; + socklen_t optlen; + void *map; + int err; + + if (!umem || !xsk_ptr || !rx || !tx) + return -EFAULT; + + if (umem->refcount) { + pr_warning("Error: shared umems not supported by libbpf.\n"); + return -EBUSY; + } + + xsk = calloc(1, sizeof(*xsk)); + if (!xsk) + return -ENOMEM; + + if (umem->refcount++ > 0) { + xsk->fd = socket(AF_XDP, SOCK_RAW, 0); + if (xsk->fd < 0) { + err = -errno; + goto out_xsk_alloc; + } + } else { + xsk->fd = umem->fd; + } + + xsk->outstanding_tx = 0; + xsk->queue_id = queue_id; + xsk->umem = umem; + xsk->ifindex = if_nametoindex(ifname); + if (!xsk->ifindex) { + err = -errno; + goto out_socket; + } + strncpy(xsk->ifname, ifname, IFNAMSIZ); + + xsk_set_xdp_socket_config(&xsk->config, usr_config); + + if (rx) { + err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING, + &xsk->config.rx_size, + sizeof(xsk->config.rx_size)); + if (err) { + err = -errno; + goto out_socket; + } + } + if (tx) { + err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING, + &xsk->config.tx_size, + sizeof(xsk->config.tx_size)); + if (err) { + err = -errno; + goto out_socket; + } + } + + optlen = sizeof(off); + err = getsockopt(xsk->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen); + if (err) { + err = -errno; + goto out_socket; + } + + if (rx) { + map = xsk_mmap(NULL, off.rx.desc + + xsk->config.rx_size * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, + xsk->fd, XDP_PGOFF_RX_RING); + if (map == MAP_FAILED) { + err = -errno; + goto out_socket; + } + + rx->mask = xsk->config.rx_size - 1; + rx->size = xsk->config.rx_size; + rx->producer = map + off.rx.producer; + rx->consumer = map + off.rx.consumer; + rx->ring = map + off.rx.desc; + } + xsk->rx = rx; + + if (tx) { + map = xsk_mmap(NULL, off.tx.desc + + xsk->config.tx_size * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, + xsk->fd, XDP_PGOFF_TX_RING); + if (map == MAP_FAILED) { + err = -errno; + goto out_mmap_rx; + } + + tx->mask = xsk->config.tx_size - 1; + tx->size = xsk->config.tx_size; + tx->producer = map + off.tx.producer; + tx->consumer = map + off.tx.consumer; + tx->ring = map + off.tx.desc; + tx->cached_cons = xsk->config.tx_size; + } + xsk->tx = tx; + + sxdp.sxdp_family = PF_XDP; + sxdp.sxdp_ifindex = xsk->ifindex; + sxdp.sxdp_queue_id = xsk->queue_id; + sxdp.sxdp_flags = xsk->config.bind_flags; + + err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp)); + if (err) { + err = -errno; + goto out_mmap_tx; + } + + if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) { + err = xsk_setup_xdp_prog(xsk); + if (err) + goto out_mmap_tx; + } + + *xsk_ptr = xsk; + return 0; + +out_mmap_tx: + if (tx) + munmap(xsk->tx, + off.tx.desc + + xsk->config.tx_size * sizeof(struct xdp_desc)); +out_mmap_rx: + if (rx) + munmap(xsk->rx, + off.rx.desc + + xsk->config.rx_size * sizeof(struct xdp_desc)); +out_socket: + if (--umem->refcount) + close(xsk->fd); +out_xsk_alloc: + free(xsk); + return err; +} + +int xsk_umem__delete(struct xsk_umem *umem) +{ + struct xdp_mmap_offsets off; + socklen_t optlen; + int err; + + if (!umem) + return 0; + + if (umem->refcount) + return -EBUSY; + + optlen = sizeof(off); + err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen); + if (!err) { + 
munmap(umem->fill->ring, + off.fr.desc + umem->config.fill_size * sizeof(__u64)); + munmap(umem->comp->ring, + off.cr.desc + umem->config.comp_size * sizeof(__u64)); + } + + close(umem->fd); + free(umem); + + return 0; +} + +void xsk_socket__delete(struct xsk_socket *xsk) +{ + struct xdp_mmap_offsets off; + socklen_t optlen; + int err; + + if (!xsk) + return; + + (void)xsk_update_bpf_maps(xsk, 0, 0); + + optlen = sizeof(off); + err = getsockopt(xsk->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen); + if (!err) { + if (xsk->rx) + munmap(xsk->rx->ring, + off.rx.desc + + xsk->config.rx_size * sizeof(struct xdp_desc)); + if (xsk->tx) + munmap(xsk->tx->ring, + off.tx.desc + + xsk->config.tx_size * sizeof(struct xdp_desc)); + } + + xsk->umem->refcount--; + /* Do not close an fd that also has an associated umem connected + * to it. + */ + if (xsk->fd != xsk->umem->fd) + close(xsk->fd); + free(xsk); +} diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h new file mode 100644 index 000000000000..a497f00e2962 --- /dev/null +++ b/tools/lib/bpf/xsk.h @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * AF_XDP user-space access library. + * + * Copyright(c) 2018 - 2019 Intel Corporation. + * + * Author(s): Magnus Karlsson + */ + +#ifndef __LIBBPF_XSK_H +#define __LIBBPF_XSK_H + +#include +#include +#include + +#include "libbpf.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Do not access these members directly. Use the functions below. */ +#define DEFINE_XSK_RING(name) \ +struct name { \ + __u32 cached_prod; \ + __u32 cached_cons; \ + __u32 mask; \ + __u32 size; \ + __u32 *producer; \ + __u32 *consumer; \ + void *ring; \ +} + +DEFINE_XSK_RING(xsk_ring_prod); +DEFINE_XSK_RING(xsk_ring_cons); + +struct xsk_umem; +struct xsk_socket; + +static inline __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill, + __u32 idx) +{ + __u64 *addrs = (__u64 *)fill->ring; + + return &addrs[idx & fill->mask]; +} + +static inline const __u64 * +xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx) +{ + const __u64 *addrs = (const __u64 *)comp->ring; + + return &addrs[idx & comp->mask]; +} + +static inline struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx, + __u32 idx) +{ + struct xdp_desc *descs = (struct xdp_desc *)tx->ring; + + return &descs[idx & tx->mask]; +} + +static inline const struct xdp_desc * +xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx) +{ + const struct xdp_desc *descs = (const struct xdp_desc *)rx->ring; + + return &descs[idx & rx->mask]; +} + +static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb) +{ + __u32 free_entries = r->cached_cons - r->cached_prod; + + if (free_entries >= nb) + return free_entries; + + /* Refresh the local tail pointer. + * cached_cons is r->size bigger than the real consumer pointer so + * that this addition can be avoided in the more frequently + * executed code that computs free_entries in the beginning of + * this function. Without this optimization it whould have been + * free_entries = r->cached_prod - r->cached_cons + r->size. + */ + r->cached_cons = *r->consumer + r->size; + + return r->cached_cons - r->cached_prod; +} + +static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb) +{ + __u32 entries = r->cached_prod - r->cached_cons; + + if (entries == 0) { + r->cached_prod = *r->producer; + entries = r->cached_prod - r->cached_cons; + } + + return (entries > nb) ? 
nb : entries; +} + +static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod, + size_t nb, __u32 *idx) +{ + if (unlikely(xsk_prod_nb_free(prod, nb) < nb)) + return 0; + + *idx = prod->cached_prod; + prod->cached_prod += nb; + + return nb; +} + +static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, size_t nb) +{ + /* Make sure everything has been written to the ring before signalling + * this to the kernel. + */ + smp_wmb(); + + *prod->producer += nb; +} + +static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons, + size_t nb, __u32 *idx) +{ + size_t entries = xsk_cons_nb_avail(cons, nb); + + if (likely(entries > 0)) { + /* Make sure we do not speculatively read the data before + * we have received the packet buffers from the ring. + */ + smp_rmb(); + + *idx = cons->cached_cons; + cons->cached_cons += entries; + } + + return entries; +} + +static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb) +{ + *cons->consumer += nb; +} + +static inline void *xsk_umem__get_data(void *umem_area, __u64 addr) +{ + return &((char *)umem_area)[addr]; +} + +LIBBPF_API int xsk_umem__fd(const struct xsk_umem *umem); +LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk); + +#define XSK_RING_CONS__DEFAULT_NUM_DESCS 2048 +#define XSK_RING_PROD__DEFAULT_NUM_DESCS 2048 +#define XSK_UMEM__DEFAULT_FRAME_SHIFT 11 /* 2048 bytes */ +#define XSK_UMEM__DEFAULT_FRAME_SIZE (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT) +#define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0 + +struct xsk_umem_config { + __u32 fill_size; + __u32 comp_size; + __u32 frame_size; + __u32 frame_headroom; +}; + +/* Flags for the libbpf_flags field. */ +#define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0) + +struct xsk_socket_config { + __u32 rx_size; + __u32 tx_size; + __u32 libbpf_flags; + __u32 xdp_flags; + __u16 bind_flags; +}; + +/* Set config to NULL to get the default configuration. */ +LIBBPF_API int xsk_umem__create(struct xsk_umem **umem, + void *umem_area, __u64 size, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp, + const struct xsk_umem_config *config); +LIBBPF_API int xsk_socket__create(struct xsk_socket **xsk, + const char *ifname, __u32 queue_id, + struct xsk_umem *umem, + struct xsk_ring_cons *rx, + struct xsk_ring_prod *tx, + const struct xsk_socket_config *config); + +/* Returns 0 for success and -EBUSY if the umem is still in use. */ +LIBBPF_API int xsk_umem__delete(struct xsk_umem *umem); +LIBBPF_API void xsk_socket__delete(struct xsk_socket *xsk); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_XSK_H */ -- cgit v1.2.3-59-g8ed1b From 248c7f9c0e215fcfd847bd3a41cf0160a2359e1a Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 21 Feb 2019 10:21:27 +0100 Subject: samples/bpf: convert xdpsock to use libbpf for AF_XDP access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit converts the xdpsock sample application to use the AF_XDP functions present in libbpf. This cuts down the size of it by nearly 300 lines of code. The default ring sizes plus the batch size has been increased and the size of the umem area has decreased. This so that the sample application will provide higher throughput. Note also that the shared umem code has been removed from the sample as this is not supported by libbpf at this point in time. 
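As a rough orientation (not the sample code itself), the high-level setup path the converted sample now relies on looks like the sketch below, using the xsk_umem__create()/xsk_socket__create() API added in the previous patch. The function name, buffer sizing and minimal error handling are illustrative assumptions; passing NULL for the config arguments selects the library defaults.

    #include <errno.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include "bpf/xsk.h"

    /* The ring structs must stay alive for as long as the umem/socket are
     * used; the real sample keeps them in its own xsk_umem_info and
     * xsk_socket_info structs, statics keep this sketch short.
     */
    static struct xsk_ring_prod fill, tx;
    static struct xsk_ring_cons comp, rx;

    static int setup_af_xdp(const char *ifname, __u32 queue_id,
                            struct xsk_umem **umem, struct xsk_socket **xsk)
    {
            /* 4096 default-sized (2 KB) frames; page aligned as required */
            size_t size = 4096 * XSK_UMEM__DEFAULT_FRAME_SIZE;
            void *buf;
            int err;

            if (posix_memalign(&buf, getpagesize(), size))
                    return -ENOMEM;

            err = xsk_umem__create(umem, buf, size, &fill, &comp, NULL);
            if (err)
                    return err;

            /* NULL config: default ring sizes, and libbpf sets up the XDP
             * program and the xskmap entry for this queue.
             */
            return xsk_socket__create(xsk, ifname, queue_id, *umem, &rx, &tx, NULL);
    }

After this, the application posts frame addresses to the fill ring and uses the xsk_ring_prod__*/xsk_ring_cons__* accessors from xsk.h for the actual packet I/O.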
Tested-by: Björn Töpel Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- samples/bpf/Makefile | 1 - samples/bpf/xdpsock.h | 11 - samples/bpf/xdpsock_kern.c | 56 --- samples/bpf/xdpsock_user.c | 841 ++++++++++++++------------------------------- 4 files changed, 261 insertions(+), 648 deletions(-) delete mode 100644 samples/bpf/xdpsock.h delete mode 100644 samples/bpf/xdpsock_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index a0ef7eddd0b3..a333e258f319 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -163,7 +163,6 @@ always += xdp2skb_meta_kern.o always += syscall_tp_kern.o always += cpustat_kern.o always += xdp_adjust_tail_kern.o -always += xdpsock_kern.o always += xdp_fwd_kern.o always += task_fd_query_kern.o always += xdp_sample_pkts_kern.o diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h deleted file mode 100644 index 533ab81adfa1..000000000000 --- a/samples/bpf/xdpsock.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef XDPSOCK_H_ -#define XDPSOCK_H_ - -/* Power-of-2 number of sockets */ -#define MAX_SOCKS 4 - -/* Round-robin receive */ -#define RR_LB 0 - -#endif /* XDPSOCK_H_ */ diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c deleted file mode 100644 index b8ccd0802b3f..000000000000 --- a/samples/bpf/xdpsock_kern.c +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define KBUILD_MODNAME "foo" -#include -#include "bpf_helpers.h" - -#include "xdpsock.h" - -struct bpf_map_def SEC("maps") qidconf_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 1, -}; - -struct bpf_map_def SEC("maps") xsks_map = { - .type = BPF_MAP_TYPE_XSKMAP, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = MAX_SOCKS, -}; - -struct bpf_map_def SEC("maps") rr_map = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(unsigned int), - .max_entries = 1, -}; - -SEC("xdp_sock") -int xdp_sock_prog(struct xdp_md *ctx) -{ - int *qidconf, key = 0, idx; - unsigned int *rr; - - qidconf = bpf_map_lookup_elem(&qidconf_map, &key); - if (!qidconf) - return XDP_ABORTED; - - if (*qidconf != ctx->rx_queue_index) - return XDP_PASS; - -#if RR_LB /* NB! RR_LB is configured in xdpsock.h */ - rr = bpf_map_lookup_elem(&rr_map, &key); - if (!rr) - return XDP_ABORTED; - - *rr = (*rr + 1) & (MAX_SOCKS - 1); - idx = *rr; -#else - idx = 0; -#endif - - return bpf_redirect_map(&xsks_map, idx, 0); -} - -char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index f73055e0191f..9c76d6d43deb 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1,37 +1,36 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2017 - 2018 Intel Corporation. 
*/ -#include +#include #include #include #include #include +#include #include #include #include +#include +#include #include +#include +#include #include #include #include #include #include -#include +#include #include #include -#include +#include #include #include -#include -#include -#include -#include #include "bpf/libbpf.h" -#include "bpf_util.h" +#include "bpf/xsk.h" #include -#include "xdpsock.h" - #ifndef SOL_XDP #define SOL_XDP 283 #endif @@ -44,17 +43,11 @@ #define PF_XDP AF_XDP #endif -#define NUM_FRAMES 131072 -#define FRAME_HEADROOM 0 -#define FRAME_SHIFT 11 -#define FRAME_SIZE 2048 -#define NUM_DESCS 1024 -#define BATCH_SIZE 16 - -#define FQ_NUM_DESCS 1024 -#define CQ_NUM_DESCS 1024 +#define NUM_FRAMES (4 * 1024) +#define BATCH_SIZE 64 #define DEBUG_HEXDUMP 0 +#define MAX_SOCKS 8 typedef __u64 u64; typedef __u32 u32; @@ -73,54 +66,31 @@ static const char *opt_if = ""; static int opt_ifindex; static int opt_queue; static int opt_poll; -static int opt_shared_packet_buffer; static int opt_interval = 1; static u32 opt_xdp_bind_flags; static __u32 prog_id; -struct xdp_umem_uqueue { - u32 cached_prod; - u32 cached_cons; - u32 mask; - u32 size; - u32 *producer; - u32 *consumer; - u64 *ring; - void *map; +struct xsk_umem_info { + struct xsk_ring_prod fq; + struct xsk_ring_cons cq; + struct xsk_umem *umem; + void *buffer; }; -struct xdp_umem { - char *frames; - struct xdp_umem_uqueue fq; - struct xdp_umem_uqueue cq; - int fd; -}; - -struct xdp_uqueue { - u32 cached_prod; - u32 cached_cons; - u32 mask; - u32 size; - u32 *producer; - u32 *consumer; - struct xdp_desc *ring; - void *map; -}; - -struct xdpsock { - struct xdp_uqueue rx; - struct xdp_uqueue tx; - int sfd; - struct xdp_umem *umem; - u32 outstanding_tx; +struct xsk_socket_info { + struct xsk_ring_cons rx; + struct xsk_ring_prod tx; + struct xsk_umem_info *umem; + struct xsk_socket *xsk; unsigned long rx_npkts; unsigned long tx_npkts; unsigned long prev_rx_npkts; unsigned long prev_tx_npkts; + u32 outstanding_tx; }; static int num_socks; -struct xdpsock *xsks[MAX_SOCKS]; +struct xsk_socket_info *xsks[MAX_SOCKS]; static unsigned long get_nsecs(void) { @@ -130,225 +100,124 @@ static unsigned long get_nsecs(void) return ts.tv_sec * 1000000000UL + ts.tv_nsec; } -static void dump_stats(void); - -#define lassert(expr) \ - do { \ - if (!(expr)) { \ - fprintf(stderr, "%s:%s:%i: Assertion failed: " \ - #expr ": errno: %d/\"%s\"\n", \ - __FILE__, __func__, __LINE__, \ - errno, strerror(errno)); \ - dump_stats(); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -#define barrier() __asm__ __volatile__("": : :"memory") -#ifdef __aarch64__ -#define u_smp_rmb() __asm__ __volatile__("dmb ishld": : :"memory") -#define u_smp_wmb() __asm__ __volatile__("dmb ishst": : :"memory") -#else -#define u_smp_rmb() barrier() -#define u_smp_wmb() barrier() -#endif -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) - -static const char pkt_data[] = - "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00" - "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14" - "\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b" - "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa"; - -static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb) -{ - u32 free_entries = q->cached_cons - q->cached_prod; - - if (free_entries >= nb) - return free_entries; - - /* Refresh the local tail pointer */ - q->cached_cons = *q->consumer + q->size; - - return q->cached_cons - q->cached_prod; -} - -static 
inline u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs) +static void print_benchmark(bool running) { - u32 free_entries = q->cached_cons - q->cached_prod; + const char *bench_str = "INVALID"; - if (free_entries >= ndescs) - return free_entries; + if (opt_bench == BENCH_RXDROP) + bench_str = "rxdrop"; + else if (opt_bench == BENCH_TXONLY) + bench_str = "txonly"; + else if (opt_bench == BENCH_L2FWD) + bench_str = "l2fwd"; - /* Refresh the local tail pointer */ - q->cached_cons = *q->consumer + q->size; - return q->cached_cons - q->cached_prod; -} + printf("%s:%d %s ", opt_if, opt_queue, bench_str); + if (opt_xdp_flags & XDP_FLAGS_SKB_MODE) + printf("xdp-skb "); + else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE) + printf("xdp-drv "); + else + printf(" "); -static inline u32 umem_nb_avail(struct xdp_umem_uqueue *q, u32 nb) -{ - u32 entries = q->cached_prod - q->cached_cons; + if (opt_poll) + printf("poll() "); - if (entries == 0) { - q->cached_prod = *q->producer; - entries = q->cached_prod - q->cached_cons; + if (running) { + printf("running..."); + fflush(stdout); } - - return (entries > nb) ? nb : entries; } -static inline u32 xq_nb_avail(struct xdp_uqueue *q, u32 ndescs) +static void dump_stats(void) { - u32 entries = q->cached_prod - q->cached_cons; + unsigned long now = get_nsecs(); + long dt = now - prev_time; + int i; - if (entries == 0) { - q->cached_prod = *q->producer; - entries = q->cached_prod - q->cached_cons; - } + prev_time = now; - return (entries > ndescs) ? ndescs : entries; -} + for (i = 0; i < num_socks && xsks[i]; i++) { + char *fmt = "%-15s %'-11.0f %'-11lu\n"; + double rx_pps, tx_pps; -static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq, - struct xdp_desc *d, - size_t nb) -{ - u32 i; + rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) * + 1000000000. / dt; + tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) * + 1000000000. 
/ dt; - if (umem_nb_free(fq, nb) < nb) - return -ENOSPC; + printf("\n sock%d@", i); + print_benchmark(false); + printf("\n"); - for (i = 0; i < nb; i++) { - u32 idx = fq->cached_prod++ & fq->mask; + printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts", + dt / 1000000000.); + printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts); + printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts); - fq->ring[idx] = d[i].addr; + xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts; + xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts; } - - u_smp_wmb(); - - *fq->producer = fq->cached_prod; - - return 0; } -static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u64 *d, - size_t nb) +static void *poller(void *arg) { - u32 i; - - if (umem_nb_free(fq, nb) < nb) - return -ENOSPC; - - for (i = 0; i < nb; i++) { - u32 idx = fq->cached_prod++ & fq->mask; - - fq->ring[idx] = d[i]; + (void)arg; + for (;;) { + sleep(opt_interval); + dump_stats(); } - u_smp_wmb(); - - *fq->producer = fq->cached_prod; - - return 0; + return NULL; } -static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq, - u64 *d, size_t nb) +static void remove_xdp_program(void) { - u32 idx, i, entries = umem_nb_avail(cq, nb); - - u_smp_rmb(); - - for (i = 0; i < entries; i++) { - idx = cq->cached_cons++ & cq->mask; - d[i] = cq->ring[idx]; - } - - if (entries > 0) { - u_smp_wmb(); + __u32 curr_prog_id = 0; - *cq->consumer = cq->cached_cons; + if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) { + printf("bpf_get_link_xdp_id failed\n"); + exit(EXIT_FAILURE); } - - return entries; -} - -static inline void *xq_get_data(struct xdpsock *xsk, u64 addr) -{ - return &xsk->umem->frames[addr]; + if (prog_id == curr_prog_id) + bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags); + else if (!curr_prog_id) + printf("couldn't find a prog id on a given interface\n"); + else + printf("program on interface changed, not removing\n"); } -static inline int xq_enq(struct xdp_uqueue *uq, - const struct xdp_desc *descs, - unsigned int ndescs) +static void int_exit(int sig) { - struct xdp_desc *r = uq->ring; - unsigned int i; + struct xsk_umem *umem = xsks[0]->umem->umem; - if (xq_nb_free(uq, ndescs) < ndescs) - return -ENOSPC; - - for (i = 0; i < ndescs; i++) { - u32 idx = uq->cached_prod++ & uq->mask; - - r[idx].addr = descs[i].addr; - r[idx].len = descs[i].len; - } + (void)sig; - u_smp_wmb(); + dump_stats(); + xsk_socket__delete(xsks[0]->xsk); + (void)xsk_umem__delete(umem); + remove_xdp_program(); - *uq->producer = uq->cached_prod; - return 0; + exit(EXIT_SUCCESS); } -static inline int xq_enq_tx_only(struct xdp_uqueue *uq, - unsigned int id, unsigned int ndescs) +static void __exit_with_error(int error, const char *file, const char *func, + int line) { - struct xdp_desc *r = uq->ring; - unsigned int i; - - if (xq_nb_free(uq, ndescs) < ndescs) - return -ENOSPC; - - for (i = 0; i < ndescs; i++) { - u32 idx = uq->cached_prod++ & uq->mask; - - r[idx].addr = (id + i) << FRAME_SHIFT; - r[idx].len = sizeof(pkt_data) - 1; - } - - u_smp_wmb(); - - *uq->producer = uq->cached_prod; - return 0; + fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func, + line, error, strerror(error)); + dump_stats(); + remove_xdp_program(); + exit(EXIT_FAILURE); } -static inline int xq_deq(struct xdp_uqueue *uq, - struct xdp_desc *descs, - int ndescs) -{ - struct xdp_desc *r = uq->ring; - unsigned int idx; - int i, entries; - - entries = xq_nb_avail(uq, ndescs); - - u_smp_rmb(); - - for (i = 0; i < entries; i++) { - idx = uq->cached_cons++ & uq->mask; - descs[i] = r[idx]; - } - - 
if (entries > 0) { - u_smp_wmb(); +#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \ + __LINE__) - *uq->consumer = uq->cached_cons; - } - - return entries; -} +static const char pkt_data[] = + "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00" + "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14" + "\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b" + "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa"; static void swap_mac_addresses(void *data) { @@ -397,258 +266,74 @@ static void hex_dump(void *pkt, size_t length, u64 addr) printf("\n"); } -static size_t gen_eth_frame(char *frame) +static size_t gen_eth_frame(struct xsk_umem_info *umem, u64 addr) { - memcpy(frame, pkt_data, sizeof(pkt_data) - 1); + memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, + sizeof(pkt_data) - 1); return sizeof(pkt_data) - 1; } -static struct xdp_umem *xdp_umem_configure(int sfd) +static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) { - int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS; - struct xdp_mmap_offsets off; - struct xdp_umem_reg mr; - struct xdp_umem *umem; - socklen_t optlen; - void *bufs; + struct xsk_umem_info *umem; + int ret; umem = calloc(1, sizeof(*umem)); - lassert(umem); - - lassert(posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */ - NUM_FRAMES * FRAME_SIZE) == 0); - - mr.addr = (__u64)bufs; - mr.len = NUM_FRAMES * FRAME_SIZE; - mr.chunk_size = FRAME_SIZE; - mr.headroom = FRAME_HEADROOM; - - lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) == 0); - lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size, - sizeof(int)) == 0); - lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size, - sizeof(int)) == 0); - - optlen = sizeof(off); - lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off, - &optlen) == 0); - - umem->fq.map = mmap(0, off.fr.desc + - FQ_NUM_DESCS * sizeof(u64), - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, sfd, - XDP_UMEM_PGOFF_FILL_RING); - lassert(umem->fq.map != MAP_FAILED); - - umem->fq.mask = FQ_NUM_DESCS - 1; - umem->fq.size = FQ_NUM_DESCS; - umem->fq.producer = umem->fq.map + off.fr.producer; - umem->fq.consumer = umem->fq.map + off.fr.consumer; - umem->fq.ring = umem->fq.map + off.fr.desc; - umem->fq.cached_cons = FQ_NUM_DESCS; - - umem->cq.map = mmap(0, off.cr.desc + - CQ_NUM_DESCS * sizeof(u64), - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, sfd, - XDP_UMEM_PGOFF_COMPLETION_RING); - lassert(umem->cq.map != MAP_FAILED); - - umem->cq.mask = CQ_NUM_DESCS - 1; - umem->cq.size = CQ_NUM_DESCS; - umem->cq.producer = umem->cq.map + off.cr.producer; - umem->cq.consumer = umem->cq.map + off.cr.consumer; - umem->cq.ring = umem->cq.map + off.cr.desc; - - umem->frames = bufs; - umem->fd = sfd; + if (!umem) + exit_with_error(errno); - if (opt_bench == BENCH_TXONLY) { - int i; - - for (i = 0; i < NUM_FRAMES * FRAME_SIZE; i += FRAME_SIZE) - (void)gen_eth_frame(&umem->frames[i]); - } + ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, + NULL); + if (ret) + exit_with_error(-ret); + umem->buffer = buffer; return umem; } -static struct xdpsock *xsk_configure(struct xdp_umem *umem) +static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem) { - struct sockaddr_xdp sxdp = {}; - struct xdp_mmap_offsets off; - int sfd, ndescs = NUM_DESCS; - struct xdpsock *xsk; - bool shared = true; - socklen_t optlen; - u64 i; - - sfd = socket(PF_XDP, SOCK_RAW, 0); - lassert(sfd >= 0); + struct 
xsk_socket_config cfg; + struct xsk_socket_info *xsk; + int ret; + u32 idx; + int i; xsk = calloc(1, sizeof(*xsk)); - lassert(xsk); - - xsk->sfd = sfd; - xsk->outstanding_tx = 0; - - if (!umem) { - shared = false; - xsk->umem = xdp_umem_configure(sfd); - } else { - xsk->umem = umem; - } - - lassert(setsockopt(sfd, SOL_XDP, XDP_RX_RING, - &ndescs, sizeof(int)) == 0); - lassert(setsockopt(sfd, SOL_XDP, XDP_TX_RING, - &ndescs, sizeof(int)) == 0); - optlen = sizeof(off); - lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off, - &optlen) == 0); - - /* Rx */ - xsk->rx.map = mmap(NULL, - off.rx.desc + - NUM_DESCS * sizeof(struct xdp_desc), - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, sfd, - XDP_PGOFF_RX_RING); - lassert(xsk->rx.map != MAP_FAILED); - - if (!shared) { - for (i = 0; i < NUM_DESCS * FRAME_SIZE; i += FRAME_SIZE) - lassert(umem_fill_to_kernel(&xsk->umem->fq, &i, 1) - == 0); - } - - /* Tx */ - xsk->tx.map = mmap(NULL, - off.tx.desc + - NUM_DESCS * sizeof(struct xdp_desc), - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, sfd, - XDP_PGOFF_TX_RING); - lassert(xsk->tx.map != MAP_FAILED); - - xsk->rx.mask = NUM_DESCS - 1; - xsk->rx.size = NUM_DESCS; - xsk->rx.producer = xsk->rx.map + off.rx.producer; - xsk->rx.consumer = xsk->rx.map + off.rx.consumer; - xsk->rx.ring = xsk->rx.map + off.rx.desc; - - xsk->tx.mask = NUM_DESCS - 1; - xsk->tx.size = NUM_DESCS; - xsk->tx.producer = xsk->tx.map + off.tx.producer; - xsk->tx.consumer = xsk->tx.map + off.tx.consumer; - xsk->tx.ring = xsk->tx.map + off.tx.desc; - xsk->tx.cached_cons = NUM_DESCS; - - sxdp.sxdp_family = PF_XDP; - sxdp.sxdp_ifindex = opt_ifindex; - sxdp.sxdp_queue_id = opt_queue; - - if (shared) { - sxdp.sxdp_flags = XDP_SHARED_UMEM; - sxdp.sxdp_shared_umem_fd = umem->fd; - } else { - sxdp.sxdp_flags = opt_xdp_bind_flags; - } - - lassert(bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0); + if (!xsk) + exit_with_error(errno); + + xsk->umem = umem; + cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg.libbpf_flags = 0; + cfg.xdp_flags = opt_xdp_flags; + cfg.bind_flags = opt_xdp_bind_flags; + ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem, + &xsk->rx, &xsk->tx, &cfg); + if (ret) + exit_with_error(-ret); + + ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags); + if (ret) + exit_with_error(-ret); + + ret = xsk_ring_prod__reserve(&xsk->umem->fq, + XSK_RING_PROD__DEFAULT_NUM_DESCS, + &idx); + if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) + exit_with_error(-ret); + for (i = 0; + i < XSK_RING_PROD__DEFAULT_NUM_DESCS * + XSK_UMEM__DEFAULT_FRAME_SIZE; + i += XSK_UMEM__DEFAULT_FRAME_SIZE) + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = i; + xsk_ring_prod__submit(&xsk->umem->fq, + XSK_RING_PROD__DEFAULT_NUM_DESCS); return xsk; } -static void print_benchmark(bool running) -{ - const char *bench_str = "INVALID"; - - if (opt_bench == BENCH_RXDROP) - bench_str = "rxdrop"; - else if (opt_bench == BENCH_TXONLY) - bench_str = "txonly"; - else if (opt_bench == BENCH_L2FWD) - bench_str = "l2fwd"; - - printf("%s:%d %s ", opt_if, opt_queue, bench_str); - if (opt_xdp_flags & XDP_FLAGS_SKB_MODE) - printf("xdp-skb "); - else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE) - printf("xdp-drv "); - else - printf(" "); - - if (opt_poll) - printf("poll() "); - - if (running) { - printf("running..."); - fflush(stdout); - } -} - -static void dump_stats(void) -{ - unsigned long now = get_nsecs(); - long dt = now - prev_time; - int i; - - prev_time = now; - - for (i = 0; 
i < num_socks && xsks[i]; i++) { - char *fmt = "%-15s %'-11.0f %'-11lu\n"; - double rx_pps, tx_pps; - - rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) * - 1000000000. / dt; - tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) * - 1000000000. / dt; - - printf("\n sock%d@", i); - print_benchmark(false); - printf("\n"); - - printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts", - dt / 1000000000.); - printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts); - printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts); - - xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts; - xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts; - } -} - -static void *poller(void *arg) -{ - (void)arg; - for (;;) { - sleep(opt_interval); - dump_stats(); - } - - return NULL; -} - -static void int_exit(int sig) -{ - __u32 curr_prog_id = 0; - - (void)sig; - dump_stats(); - if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(EXIT_FAILURE); - } - if (prog_id == curr_prog_id) - bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags); - else if (!curr_prog_id) - printf("couldn't find a prog id on a given interface\n"); - else - printf("program on interface changed, not removing\n"); - exit(EXIT_SUCCESS); -} - static struct option long_options[] = { {"rxdrop", no_argument, 0, 'r'}, {"txonly", no_argument, 0, 't'}, @@ -656,7 +341,6 @@ static struct option long_options[] = { {"interface", required_argument, 0, 'i'}, {"queue", required_argument, 0, 'q'}, {"poll", no_argument, 0, 'p'}, - {"shared-buffer", no_argument, 0, 's'}, {"xdp-skb", no_argument, 0, 'S'}, {"xdp-native", no_argument, 0, 'N'}, {"interval", required_argument, 0, 'n'}, @@ -676,7 +360,6 @@ static void usage(const char *prog) " -i, --interface=n Run on interface n\n" " -q, --queue=n Use queue n (default 0)\n" " -p, --poll Use poll syscall\n" - " -s, --shared-buffer Use shared packet buffer\n" " -S, --xdp-skb=n Use XDP skb-mod\n" " -N, --xdp-native=n Enfore XDP native mode\n" " -n, --interval=n Specify statistics update interval (default 1 sec).\n" @@ -715,9 +398,6 @@ static void parse_command_line(int argc, char **argv) case 'q': opt_queue = atoi(optarg); break; - case 's': - opt_shared_packet_buffer = 1; - break; case 'p': opt_poll = 1; break; @@ -751,75 +431,104 @@ static void parse_command_line(int argc, char **argv) opt_if); usage(basename(argv[0])); } + } -static void kick_tx(int fd) +static void kick_tx(struct xsk_socket_info *xsk) { int ret; - ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0); + ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || errno == EBUSY) return; - lassert(0); + exit_with_error(errno); } -static inline void complete_tx_l2fwd(struct xdpsock *xsk) +static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk) { - u64 descs[BATCH_SIZE]; + u32 idx_cq, idx_fq; unsigned int rcvd; size_t ndescs; if (!xsk->outstanding_tx) return; - kick_tx(xsk->sfd); + kick_tx(xsk); ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? 
BATCH_SIZE : - xsk->outstanding_tx; + xsk->outstanding_tx; /* re-add completed Tx buffers */ - rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, ndescs); + rcvd = xsk_ring_cons__peek(&xsk->umem->cq, ndescs, &idx_cq); if (rcvd > 0) { - umem_fill_to_kernel(&xsk->umem->fq, descs, rcvd); + unsigned int i; + int ret; + + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(-ret); + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, + &idx_fq); + } + for (i = 0; i < rcvd; i++) + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = + *xsk_ring_cons__comp_addr(&xsk->umem->cq, + idx_cq++); + + xsk_ring_prod__submit(&xsk->umem->fq, rcvd); + xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; xsk->tx_npkts += rcvd; } } -static inline void complete_tx_only(struct xdpsock *xsk) +static inline void complete_tx_only(struct xsk_socket_info *xsk) { - u64 descs[BATCH_SIZE]; unsigned int rcvd; + u32 idx; if (!xsk->outstanding_tx) return; - kick_tx(xsk->sfd); + kick_tx(xsk); - rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, BATCH_SIZE); + rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx); if (rcvd > 0) { + xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; xsk->tx_npkts += rcvd; } } -static void rx_drop(struct xdpsock *xsk) +static void rx_drop(struct xsk_socket_info *xsk) { - struct xdp_desc descs[BATCH_SIZE]; unsigned int rcvd, i; + u32 idx_rx, idx_fq = 0; + int ret; - rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE); + rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); if (!rcvd) return; + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(-ret); + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + } + for (i = 0; i < rcvd; i++) { - char *pkt = xq_get_data(xsk, descs[i].addr); + u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; + u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; + char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); - hex_dump(pkt, descs[i].len, descs[i].addr); + hex_dump(pkt, len, addr); + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = addr; } + xsk_ring_prod__submit(&xsk->umem->fq, rcvd); + xsk_ring_cons__release(&xsk->rx, rcvd); xsk->rx_npkts += rcvd; - - umem_fill_to_kernel_ex(&xsk->umem->fq, descs, rcvd); } static void rx_drop_all(void) @@ -830,7 +539,7 @@ static void rx_drop_all(void) memset(fds, 0, sizeof(fds)); for (i = 0; i < num_socks; i++) { - fds[i].fd = xsks[i]->sfd; + fds[i].fd = xsk_socket__fd(xsks[i]->xsk); fds[i].events = POLLIN; timeout = 1000; /* 1sn */ } @@ -847,14 +556,14 @@ static void rx_drop_all(void) } } -static void tx_only(struct xdpsock *xsk) +static void tx_only(struct xsk_socket_info *xsk) { int timeout, ret, nfds = 1; struct pollfd fds[nfds + 1]; - unsigned int idx = 0; + u32 idx, frame_nb = 0; memset(fds, 0, sizeof(fds)); - fds[0].fd = xsk->sfd; + fds[0].fd = xsk_socket__fd(xsk->xsk); fds[0].events = POLLOUT; timeout = 1000; /* 1sn */ @@ -864,50 +573,73 @@ static void tx_only(struct xdpsock *xsk) if (ret <= 0) continue; - if (fds[0].fd != xsk->sfd || - !(fds[0].revents & POLLOUT)) + if (!(fds[0].revents & POLLOUT)) continue; } - if (xq_nb_free(&xsk->tx, BATCH_SIZE) >= BATCH_SIZE) { - lassert(xq_enq_tx_only(&xsk->tx, idx, BATCH_SIZE) == 0); + if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == + BATCH_SIZE) { + unsigned int i; + for (i = 0; i < BATCH_SIZE; i++) { + xsk_ring_prod__tx_desc(&xsk->tx, idx + 
i)->addr + = (frame_nb + i) << + XSK_UMEM__DEFAULT_FRAME_SHIFT; + xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len = + sizeof(pkt_data) - 1; + } + + xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE); xsk->outstanding_tx += BATCH_SIZE; - idx += BATCH_SIZE; - idx %= NUM_FRAMES; + frame_nb += BATCH_SIZE; + frame_nb %= NUM_FRAMES; } complete_tx_only(xsk); } } -static void l2fwd(struct xdpsock *xsk) +static void l2fwd(struct xsk_socket_info *xsk) { for (;;) { - struct xdp_desc descs[BATCH_SIZE]; unsigned int rcvd, i; + u32 idx_rx, idx_tx = 0; int ret; for (;;) { complete_tx_l2fwd(xsk); - rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE); + rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, + &idx_rx); if (rcvd > 0) break; } + ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(-ret); + ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); + } + for (i = 0; i < rcvd; i++) { - char *pkt = xq_get_data(xsk, descs[i].addr); + u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, + idx_rx)->addr; + u32 len = xsk_ring_cons__rx_desc(&xsk->rx, + idx_rx++)->len; + char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); swap_mac_addresses(pkt); - hex_dump(pkt, descs[i].len, descs[i].addr); + hex_dump(pkt, len, addr); + xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr; + xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len; } - xsk->rx_npkts += rcvd; + xsk_ring_prod__submit(&xsk->tx, rcvd); + xsk_ring_cons__release(&xsk->rx, rcvd); - ret = xq_enq(&xsk->tx, descs, rcvd); - lassert(ret == 0); + xsk->rx_npkts += rcvd; xsk->outstanding_tx += rcvd; } } @@ -915,17 +647,10 @@ static void l2fwd(struct xdpsock *xsk) int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; - int prog_fd, qidconf_map, xsks_map; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - struct bpf_object *obj; - char xdp_filename[256]; - struct bpf_map *map; - int i, ret, key = 0; + struct xsk_umem_info *umem; pthread_t pt; + void *bufs; + int ret; parse_command_line(argc, argv); @@ -935,67 +660,22 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } - snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]); - prog_load_attr.file = xdp_filename; - - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) - exit(EXIT_FAILURE); - if (prog_fd < 0) { - fprintf(stderr, "ERROR: no program found: %s\n", - strerror(prog_fd)); - exit(EXIT_FAILURE); - } - - map = bpf_object__find_map_by_name(obj, "qidconf_map"); - qidconf_map = bpf_map__fd(map); - if (qidconf_map < 0) { - fprintf(stderr, "ERROR: no qidconf map found: %s\n", - strerror(qidconf_map)); - exit(EXIT_FAILURE); - } - - map = bpf_object__find_map_by_name(obj, "xsks_map"); - xsks_map = bpf_map__fd(map); - if (xsks_map < 0) { - fprintf(stderr, "ERROR: no xsks map found: %s\n", - strerror(xsks_map)); - exit(EXIT_FAILURE); - } - - if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) { - fprintf(stderr, "ERROR: link set xdp fd failed\n"); - exit(EXIT_FAILURE); - } - - ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (ret) { - printf("can't get prog info - %s\n", strerror(errno)); - return 1; - } - prog_id = info.id; + ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */ + NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE); + if (ret) + exit_with_error(ret); - ret = bpf_map_update_elem(qidconf_map, &key, &opt_queue, 0); - if (ret) { - fprintf(stderr, "ERROR: bpf_map_update_elem 
qidconf\n");
- exit(EXIT_FAILURE);
- }
+ /* Create sockets... */
+ umem = xsk_configure_umem(bufs,
+ NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE);
+ xsks[num_socks++] = xsk_configure_socket(umem);
- /* Create sockets... */
- xsks[num_socks++] = xsk_configure(NULL);
-
-#if RR_LB
- for (i = 0; i < MAX_SOCKS - 1; i++)
- xsks[num_socks++] = xsk_configure(xsks[0]->umem);
-#endif
+ if (opt_bench == BENCH_TXONLY) {
+ int i;
- /* ...and insert them into the map. */
- for (i = 0; i < num_socks; i++) {
- key = i;
- ret = bpf_map_update_elem(xsks_map, &key, &xsks[i]->sfd, 0);
- if (ret) {
- fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i);
- exit(EXIT_FAILURE);
- }
+ for (i = 0; i < NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
+ i += XSK_UMEM__DEFAULT_FRAME_SIZE)
+ (void)gen_eth_frame(umem, i);
 }
 signal(SIGINT, int_exit);
@@ -1005,7 +685,8 @@ int main(int argc, char **argv)
 setlocale(LC_ALL, "");
 ret = pthread_create(&pt, NULL, poller, NULL);
- lassert(ret == 0);
+ if (ret)
+ exit_with_error(ret);
 prev_time = get_nsecs();
-- cgit v1.2.3-59-g8ed1b
From 0f4a9b7d4ecbac191052cb80b84a46471fd30d80 Mon Sep 17 00:00:00 2001
From: Magnus Karlsson
Date: Thu, 21 Feb 2019 10:21:28 +0100
Subject: xsk: add FAQ to facilitate for first time users
Added an FAQ section in Documentation/networking/af_xdp.rst to help first time users with common problems. As problems are getting identified, entries will be added to the FAQ.
Signed-off-by: Magnus Karlsson
Signed-off-by: Daniel Borkmann
---
 Documentation/networking/af_xdp.rst | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)
diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index 4ae4f9d8f8fe..e14d7d40fc75 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -295,6 +295,41 @@ using::
 For XDP_SKB mode, use the switch "-S" instead of "-N" and all options
 can be displayed with "-h", as usual.
+FAQ
+=======
+
+Q: I am not seeing any traffic on the socket. What am I doing wrong?
+
+A: When a netdev of a physical NIC is initialized, Linux usually
+ allocates one Rx and Tx queue pair per core. So on an 8 core system,
+ queue ids 0 to 7 will be allocated, one per core. In the AF_XDP
+ bind call or the xsk_socket__create libbpf function call, you
+ specify a specific queue id to bind to and it is only the traffic
+ towards that queue you are going to get on your socket. So in the
+ example above, if you bind to queue 0, you are NOT going to get any
+ traffic that is distributed to queues 1 through 7. If you are
+ lucky, you will see the traffic, but usually it will end up on one
+ of the queues you have not bound to.
+
+ There are a number of ways to solve the problem of getting the
+ traffic you want to the queue id you bound to. If you want to see
+ all the traffic, you can force the netdev to only have 1 queue, queue
+ id 0, and then bind to queue 0. You can use ethtool to do this::
+
+ sudo ethtool -L <interface> combined 1
+
+ If you want to only see part of the traffic, you can program the
+ NIC through ethtool to filter out your traffic to a single queue id
+ that you can bind your XDP socket to. Here is one example in which
+ UDP traffic to and from port 4242 is sent to queue 2::
+
+ sudo ethtool -N <interface> rx-flow-hash udp4 fn
+ sudo ethtool -N <interface> flow-type udp4 src-port 4242 dst-port \
+ 4242 action 2
+
+ A number of other ways are possible, all up to the capabilities of
+ the NIC you have.
+
 Credits
 =======
@@ -309,4 +344,3 @@ Credits
 - Michael S.
Tsirkin
 - Qi Z Zhang
 - Willem de Bruijn
-
-- cgit v1.2.3-59-g8ed1b
From 492ecee892c2a4ba6a14903d5d586ff750b7e805 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 25 Feb 2019 14:28:39 -0800
Subject: bpf: enable program stats
JITed BPF programs are indistinguishable from kernel functions, but unlike kernel code BPF code can be changed often. The typical approach of "perf record" + "perf report" profiling and tuning of kernel code works just as well for BPF programs, but kernel code doesn't need to be monitored whereas BPF programs do. Users load and run a large number of BPF programs. These BPF stats allow tools to monitor the usage of BPF on the server. The monitoring tools will turn sysctl kernel.bpf_stats_enabled on and off for a few seconds to sample the average cost of the programs. Aggregated data over hours and days will provide an insight into the cost of BPF and alarms can trigger in case a given program suddenly gets more expensive.
The cost of two sched_clock() calls per program invocation adds ~20 nsec. Fast BPF progs (like selftests/bpf/progs/test_pkt_access.c) will slow down from ~10 nsec to ~30 nsec. static_key minimizes the cost of the stats collection. There is no measurable difference before/after this patch with kernel.bpf_stats_enabled=0.
Signed-off-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann
---
 include/linux/bpf.h | 9 +++++++++
 include/linux/filter.h | 20 +++++++++++++++++++-
 kernel/bpf/core.c | 31 +++++++++++++++++++++++++++++--
 kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++++--
 kernel/bpf/verifier.c | 7 ++++++-
 kernel/sysctl.c | 34 ++++++++++++++++++++++++++++++++++
 6 files changed, 129 insertions(+), 6 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index de18227b3d95..a2132e09dc1c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include
 struct bpf_verifier_env;
 struct perf_event;
@@ -340,6 +341,12 @@ enum bpf_cgroup_storage_type {
 #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
+struct bpf_prog_stats {
+ u64 cnt;
+ u64 nsecs;
+ struct u64_stats_sync syncp;
+};
+
 struct bpf_prog_aux {
 atomic_t refcnt;
 u32 used_map_cnt;
@@ -389,6 +396,7 @@ struct bpf_prog_aux {
 * main prog always has linfo_idx == 0
 */
 u32 linfo_idx;
+ struct bpf_prog_stats __percpu *stats;
 union {
 struct work_struct work;
 struct rcu_head rcu;
@@ -559,6 +567,7 @@ void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
 extern int sysctl_unprivileged_bpf_disabled;
+extern int sysctl_bpf_stats_enabled;
 int bpf_map_new_fd(struct bpf_map *map, int flags);
 int bpf_prog_new_fd(struct bpf_prog *prog);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f32b3eca5a04..7e5e3db11106 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -533,7 +533,24 @@ struct sk_filter {
 struct bpf_prog *prog;
 };
-#define BPF_PROG_RUN(filter, ctx) ({ cant_sleep(); (*(filter)->bpf_func)(ctx, (filter)->insnsi); })
+DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
+
+#define BPF_PROG_RUN(prog, ctx) ({ \
+ u32 ret; \
+ cant_sleep(); \
+ if (static_branch_unlikely(&bpf_stats_enabled_key)) { \
+ struct bpf_prog_stats *stats; \
+ u64 start = sched_clock(); \
+ ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \
+ stats = this_cpu_ptr(prog->aux->stats); \
+ u64_stats_update_begin(&stats->syncp); \
+ stats->cnt++; \
+ stats->nsecs += sched_clock() - start; \
+ u64_stats_update_end(&stats->syncp); \
+ } else { \
+ ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \
+ } \
+ ret; }) #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN @@ -764,6 +781,7 @@ void bpf_prog_free_jited_linfo(struct bpf_prog *prog); void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog); struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ef88b167959d..1c14c347f3cf 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -78,7 +78,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } -struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; @@ -104,6 +104,26 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) return fp; } + +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + struct bpf_prog *prog; + + prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); + if (!prog) + return NULL; + + prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); + if (!prog->aux->stats) { + kfree(prog->aux); + vfree(prog); + return NULL; + } + + u64_stats_init(&prog->aux->stats->syncp); + return prog; +} EXPORT_SYMBOL_GPL(bpf_prog_alloc); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) @@ -231,7 +251,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, void __bpf_prog_free(struct bpf_prog *fp) { - kfree(fp->aux); + if (fp->aux) { + free_percpu(fp->aux->stats); + kfree(fp->aux); + } vfree(fp); } @@ -2069,6 +2092,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } +DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); +EXPORT_SYMBOL(bpf_stats_enabled_key); +int sysctl_bpf_stats_enabled __read_mostly; + /* All definitions of tracepoints related to BPF. 
*/ #define CREATE_TRACE_POINTS #include diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ec7c552af76b..31cf66fc3f5c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1283,24 +1283,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +static void bpf_prog_get_stats(const struct bpf_prog *prog, + struct bpf_prog_stats *stats) +{ + u64 nsecs = 0, cnt = 0; + int cpu; + + for_each_possible_cpu(cpu) { + const struct bpf_prog_stats *st; + unsigned int start; + u64 tnsecs, tcnt; + + st = per_cpu_ptr(prog->aux->stats, cpu); + do { + start = u64_stats_fetch_begin_irq(&st->syncp); + tnsecs = st->nsecs; + tcnt = st->cnt; + } while (u64_stats_fetch_retry_irq(&st->syncp, start)); + nsecs += tnsecs; + cnt += tcnt; + } + stats->nsecs = nsecs; + stats->cnt = cnt; +} + #ifdef CONFIG_PROC_FS static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_prog *prog = filp->private_data; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; + struct bpf_prog_stats stats; + bpf_prog_get_stats(prog, &stats); bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "prog_type:\t%u\n" "prog_jited:\t%u\n" "prog_tag:\t%s\n" "memlock:\t%llu\n" - "prog_id:\t%u\n", + "prog_id:\t%u\n" + "run_time_ns:\t%llu\n" + "run_cnt:\t%llu\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, - prog->aux->id); + prog->aux->id, + stats.nsecs, + stats.cnt); } #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1b9496c41383..0e4edd7e3c5f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7320,7 +7320,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; - func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); + /* BPF_PROG_RUN doesn't call subprogs directly, + * hence main prog stats include the runtime of subprogs. 
+ * subprogs don't have IDs and not reachable via prog_get_next_id + * func[i]->aux->stats will never be accessed and stays NULL + */ + func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) goto out_free; memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ba4d9e85feb8..86e0771352f2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -224,6 +224,9 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #endif static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses its own private copy */ @@ -1230,6 +1233,15 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif + { + .procname = "bpf_stats_enabled", + .data = &sysctl_bpf_stats_enabled, + .maxlen = sizeof(sysctl_bpf_stats_enabled), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_bpf_stats, + .extra1 = &zero, + .extra2 = &one, + }, #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) { .procname = "panic_on_rcu_stall", @@ -3260,6 +3272,28 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, bpf_stats = *(int *)table->data; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &bpf_stats; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + *(int *)table->data = bpf_stats; + if (bpf_stats) + static_branch_enable(&bpf_stats_enabled_key); + else + static_branch_disable(&bpf_stats_enabled_key); + } + return ret; +} + /* * No sense putting this after each symbol definition, twice, * exception granted :-) -- cgit v1.2.3-59-g8ed1b From 5f8f8b93aeb8371c54af08bece2bd04bc2d48707 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:40 -0800 Subject: bpf: expose program stats via bpf_prog_info Return bpf program run_time_ns and run_cnt via bpf_prog_info Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bcdd2474eee7..2e308e90ffea 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2813,6 +2813,8 @@ struct bpf_prog_info { __u32 jited_line_info_rec_size; __u32 nr_prog_tags; __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 31cf66fc3f5c..174581dfe225 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2152,6 +2152,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_prog_info info = {}; u32 info_len = attr->info.info_len; + struct bpf_prog_stats stats; char __user *uinsns; u32 ulen; int err; @@ -2191,6 +2192,10 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, if (err) return err; + bpf_prog_get_stats(prog, &stats); + info.run_time_ns = stats.nsecs; + info.run_cnt = stats.cnt; + if (!capable(CAP_SYS_ADMIN)) { 
info.jited_prog_len = 0; info.xlated_prog_len = 0; -- cgit v1.2.3-59-g8ed1b From b1eca86db68b48f59b9f195c8b4f8114d2a9918c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:41 -0800 Subject: tools/bpf: sync bpf.h into tools sync bpf.h into tools directory Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bcdd2474eee7..2e308e90ffea 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2813,6 +2813,8 @@ struct bpf_prog_info { __u32 jited_line_info_rec_size; __u32 nr_prog_tags; __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.2.3-59-g8ed1b From 88ad472b8a4ad2292d11835652462fd9f745245e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:42 -0800 Subject: tools/bpftool: recognize bpf_prog_info run_time_ns and run_cnt $ bpftool p s 1: kprobe tag a56587d488d216c9 gpl run_time_ns 79786 run_cnt 8 loaded_at 2019-02-22T12:22:51-0800 uid 0 xlated 352B not jited memlock 4096B $ bpftool --json --pretty p s [{ "id": 1, "type": "kprobe", "tag": "a56587d488d216c9", "gpl_compatible": true, "run_time_ns": 79786, "run_cnt": 8, "loaded_at": 1550866971, "uid": 0, "bytes_xlated": 352, "jited": false, "bytes_memlock": 4096 } ] Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/Documentation/bpftool-prog.rst | 4 +++- tools/bpf/bpftool/prog.c | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 12bc1e2d4b46..9386bd6e0396 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -171,7 +171,7 @@ EXAMPLES :: - 10: xdp name some_prog tag 005a3d2123620c8b gpl + 10: xdp name some_prog tag 005a3d2123620c8b gpl run_time_ns 81632 run_cnt 10 loaded_at 2017-09-29T20:11:00+0000 uid 0 xlated 528B jited 370B memlock 4096B map_ids 10 @@ -184,6 +184,8 @@ EXAMPLES "type": "xdp", "tag": "005a3d2123620c8b", "gpl_compatible": true, + "run_time_ns": 81632, + "run_cnt": 10, "loaded_at": 1506715860, "uid": 0, "bytes_xlated": 528, diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index db978c8d76a8..0c35dd543d49 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -214,6 +214,10 @@ static void print_prog_json(struct bpf_prog_info *info, int fd) info->tag[4], info->tag[5], info->tag[6], info->tag[7]); jsonw_bool_field(json_wtr, "gpl_compatible", info->gpl_compatible); + if (info->run_time_ns) { + jsonw_uint_field(json_wtr, "run_time_ns", info->run_time_ns); + jsonw_uint_field(json_wtr, "run_cnt", info->run_cnt); + } print_dev_json(info->ifindex, info->netns_dev, info->netns_ino); @@ -277,6 +281,9 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd) fprint_hex(stdout, info->tag, BPF_TAG_SIZE, ""); print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino); printf("%s", info->gpl_compatible ? " gpl" : ""); + if (info->run_time_ns) + printf(" run_time_ns %lld run_cnt %lld", + info->run_time_ns, info->run_cnt); printf("\n"); if (info->load_time) { -- cgit v1.2.3-59-g8ed1b From d2e614cb0795d935aee879e47aab231247274f13 Mon Sep 17 00:00:00 2001 From: "Daniel T. 
Lee"
Date: Wed, 27 Feb 2019 02:52:26 -0500
Subject: samples: bpf: fix: broken sample regarding removed function
Currently, running the samples "task_fd_query" and "tracex3" produces the following error. On kernel v5.0-rc* this sample will be unavailable due to the removal of function 'blk_start_request' at commit "a1ce35f". (function removed, as "Single Queue IO scheduler" no longer exists)
$ sudo ./task_fd_query
failed to create kprobe 'blk_start_request' error 'No such file or directory'
This commit will change the function 'blk_start_request' to 'blk_mq_start_request' to fix the broken sample.
Signed-off-by: Daniel T. Lee
Signed-off-by: Daniel Borkmann
---
 samples/bpf/task_fd_query_kern.c | 2 +-
 samples/bpf/task_fd_query_user.c | 2 +-
 samples/bpf/tracex3_kern.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
index f4b0a9ea674d..fb56fc2a3e5d 100644
--- a/samples/bpf/task_fd_query_kern.c
+++ b/samples/bpf/task_fd_query_kern.c
@@ -4,7 +4,7 @@
 #include
 #include "bpf_helpers.h"
-SEC("kprobe/blk_start_request")
+SEC("kprobe/blk_mq_start_request")
 int bpf_prog1(struct pt_regs *ctx)
 {
 return 0;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
index 8381d792f138..aff2b4ae914e 100644
--- a/samples/bpf/task_fd_query_user.c
+++ b/samples/bpf/task_fd_query_user.c
@@ -311,7 +311,7 @@ int main(int argc, char **argv)
 }
 /* test two functions in the corresponding *_kern.c file */
- CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_start_request",
+ CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_mq_start_request",
 BPF_FD_TYPE_KPROBE));
 CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion",
 BPF_FD_TYPE_KRETPROBE));
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
index 9974c3d7c18b..ea1d4c19c132 100644
--- a/samples/bpf/tracex3_kern.c
+++ b/samples/bpf/tracex3_kern.c
@@ -20,7 +20,7 @@ struct bpf_map_def SEC("maps") my_map = {
 /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
 * example will no longer be meaningful
 */
-SEC("kprobe/blk_start_request")
+SEC("kprobe/blk_mq_start_request")
 int bpf_prog1(struct pt_regs *ctx)
 {
 long rq = PT_REGS_PARM1(ctx);
-- cgit v1.2.3-59-g8ed1b
From a115d0ed7201a5904c084ae6f07913fe2b9396a6 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Wed, 27 Feb 2019 13:22:56 -0800
Subject: bpf: set inner_map_meta->spin_lock_off correctly
Commit d83525ca62cf ("bpf: introduce bpf_spin_lock") introduced bpf_spin_lock and the field spin_lock_off in kernel internal structure bpf_map has the following meaning: >=0 valid offset, <0 error
For every map created, the kernel will ensure spin_lock_off has a correct value.
Currently, bpf_map->spin_lock_off is not copied from the inner map to the map_in_map inner_map_meta during a map_in_map type map creation, so inner_map_meta->spin_lock_off = 0. This will give the verifier wrong information that inner_map has bpf_spin_lock and the bpf_spin_lock is defined at offset 0. An access to offset 0 of a value pointer will trigger the following error: bpf_spin_lock cannot be accessed directly by load/store
This patch fixes the issue by copying the inner map's spin_lock_off value to inner_map_meta->spin_lock_off.
Fixes: d83525ca62cf ("bpf: introduce bpf_spin_lock") Signed-off-by: Yonghong Song Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- kernel/bpf/map_in_map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 583346a0ab29..3dff41403583 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -58,6 +58,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; + inner_map_meta->spin_lock_off = inner_map->spin_lock_off; /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; -- cgit v1.2.3-59-g8ed1b From 9eca5083757b679b37f210092c871916c2c222d0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 27 Feb 2019 13:22:57 -0800 Subject: tools/bpf: selftests: add map lookup to test_map_in_map bpf prog The bpf_map_lookup_elem is added in the bpf program. Without previous patch, the test change will trigger the following error: $ ./test_maps ... ; value_p = bpf_map_lookup_elem(map, &key); 20: (bf) r1 = r7 21: (bf) r2 = r8 22: (85) call bpf_map_lookup_elem#1 ; if (!value_p || *value_p != 123) 23: (15) if r0 == 0x0 goto pc+16 R0=map_value(id=2,off=0,ks=4,vs=4,imm=0) R6=inv1 R7=map_ptr(id=0,off=0,ks=4,vs=4,imm=0) R8=fp-8,call_-1 R10=fp0,call_-1 fp-8=mmmmmmmm ; if (!value_p || *value_p != 123) 24: (61) r1 = *(u32 *)(r0 +0) R0=map_value(id=2,off=0,ks=4,vs=4,imm=0) R6=inv1 R7=map_ptr(id=0,off=0,ks=4,vs=4,imm=0) R8=fp-8,call_-1 R10=fp0,call_-1 fp-8=mmmmmmmm bpf_spin_lock cannot be accessed directly by load/store With the kernel fix in the previous commit, the error goes away. Signed-off-by: Yonghong Song Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_map_in_map.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/test_map_in_map.c b/tools/testing/selftests/bpf/progs/test_map_in_map.c index ce923e67e08e..2985f262846e 100644 --- a/tools/testing/selftests/bpf/progs/test_map_in_map.c +++ b/tools/testing/selftests/bpf/progs/test_map_in_map.c @@ -27,6 +27,7 @@ SEC("xdp_mimtest") int xdp_mimtest0(struct xdp_md *ctx) { int value = 123; + int *value_p; int key = 0; void *map; @@ -35,6 +36,9 @@ int xdp_mimtest0(struct xdp_md *ctx) return XDP_DROP; bpf_map_update_elem(map, &key, &value, 0); + value_p = bpf_map_lookup_elem(map, &key); + if (!value_p || *value_p != 123) + return XDP_DROP; map = bpf_map_lookup_elem(&mim_hash, &key); if (!map) -- cgit v1.2.3-59-g8ed1b From 3fcc5530bcb2a879e32bd940e6eafee328ac3647 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 27 Feb 2019 18:30:44 -0800 Subject: bpf: fix build without bpf_syscall wrap bpf_stats_enabled sysctl with #ifdef Reported-by: Stephen Rothwell Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/sysctl.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 86e0771352f2..7578e21a711b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -224,9 +224,11 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #endif static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_BPF_SYSCALL static int proc_dointvec_minmax_bpf_stats(struct ctl_table 
*table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#endif #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses its own private copy */ @@ -1232,7 +1234,6 @@ static struct ctl_table kern_table[] = { .extra1 = &one, .extra2 = &one, }, -#endif { .procname = "bpf_stats_enabled", .data = &sysctl_bpf_stats_enabled, @@ -1242,6 +1243,7 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#endif #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) { .procname = "panic_on_rcu_stall", @@ -3272,6 +3274,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ +#ifdef CONFIG_BPF_SYSCALL static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -3293,7 +3296,7 @@ static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, } return ret; } - +#endif /* * No sense putting this after each symbol definition, twice, * exception granted :-) -- cgit v1.2.3-59-g8ed1b From f2bb53887eb3e8f859ac7cfc09d1a3801492c009 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 27 Feb 2019 11:08:06 -0500 Subject: bpf: add missing entries to bpf_helpers.h This header defines the BPF functions enumerated in uapi/linux.bpf.h in a callable format. Expand to include all registered functions. Signed-off-by: Willem de Bruijn Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/bpf_helpers.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index d9999f1ed1d2..026bea831e03 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -232,6 +232,36 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) = (void *) BPF_FUNC_skb_change_head; static int (*bpf_skb_pull_data)(void *, int len) = (void *) BPF_FUNC_skb_pull_data; +static unsigned int (*bpf_get_cgroup_classid)(void *ctx) = + (void *) BPF_FUNC_get_cgroup_classid; +static unsigned int (*bpf_get_route_realm)(void *ctx) = + (void *) BPF_FUNC_get_route_realm; +static int (*bpf_skb_change_proto)(void *ctx, __be16 proto, __u64 flags) = + (void *) BPF_FUNC_skb_change_proto; +static int (*bpf_skb_change_type)(void *ctx, __u32 type) = + (void *) BPF_FUNC_skb_change_type; +static unsigned int (*bpf_get_hash_recalc)(void *ctx) = + (void *) BPF_FUNC_get_hash_recalc; +static unsigned long long (*bpf_get_current_task)(void *ctx) = + (void *) BPF_FUNC_get_current_task; +static int (*bpf_skb_change_tail)(void *ctx, __u32 len, __u64 flags) = + (void *) BPF_FUNC_skb_change_tail; +static long long (*bpf_csum_update)(void *ctx, __u32 csum) = + (void *) BPF_FUNC_csum_update; +static void (*bpf_set_hash_invalid)(void *ctx) = + (void *) BPF_FUNC_set_hash_invalid; +static int (*bpf_get_numa_node_id)(void) = + (void *) BPF_FUNC_get_numa_node_id; +static int (*bpf_probe_read_str)(void *ctx, __u32 size, + const void *unsafe_ptr) = + (void *) BPF_FUNC_probe_read_str; +static unsigned int (*bpf_get_socket_uid)(void *ctx) = + (void *) BPF_FUNC_get_socket_uid; +static unsigned int (*bpf_set_hash)(void *ctx, __u32 hash) = + (void *) BPF_FUNC_set_hash; +static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, + unsigned long long flags) = + (void *) BPF_FUNC_skb_adjust_room; /* Scan the ARCH passed in from ARCH env variable (see Makefile) */ #if defined(__TARGET_ARCH_x86) -- cgit v1.2.3-59-g8ed1b 
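For context on how the helper declarations added above are meant to be consumed, here is a minimal, illustrative sketch of a selftest-style program calling bpf_probe_read_str(); it is not part of the patch series, and the kprobe target (do_sys_open), buffer size, and program name are assumptions made purely for the example, relying on the SEC() and PT_REGS_PARM2() macros that bpf_helpers.h already provides.

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch only: read the filename argument of do_sys_open()
 * through the bpf_probe_read_str() wrapper declared in bpf_helpers.h.
 */
#include <linux/bpf.h>
#include <linux/ptrace.h>
#include "bpf_helpers.h"

SEC("kprobe/do_sys_open")
int probe_read_str_example(struct pt_regs *ctx)
{
	char fname[64] = {};

	/* second argument of do_sys_open() is the filename pointer */
	bpf_probe_read_str(fname, sizeof(fname),
			   (const void *)PT_REGS_PARM2(ctx));
	return 0;
}

char _license[] SEC("license") = "GPL";

Note that, as declared in the patch above, the first parameter of the bpf_probe_read_str() wrapper is the destination buffer even though the declaration names it ctx.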
From ebace0e981b2aa6b2c0eb9bee0df3676fd690d8b Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev
Date: Wed, 27 Feb 2019 11:15:11 -0800
Subject: selftests/bpf: use __bpf_constant_htons in test_prog.c for flow dissector
Older GCC (<4.8) isn't smart enough to optimize !__builtin_constant_p() branch in bpf_htons. I recently fixed it for pkt_v4 and pkt_v6 in commit a0517a0f7ef23 ("selftests/bpf: use __bpf_constant_htons in test_prog.c"), but later added another bunch of bpf_htons in commit bf0f0fd939451 ("selftests/bpf: add simple BPF_PROG_TEST_RUN examples for flow dissector").
Fixes: bf0f0fd939451 ("selftests/bpf: add simple BPF_PROG_TEST_RUN examples for flow dissector")
Signed-off-by: Stanislav Fomichev
Acked-by: Song Liu
Signed-off-by: Daniel Borkmann
---
 tools/testing/selftests/bpf/test_progs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index c59d2e015d16..87cde42559f7 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1954,7 +1954,7 @@ static struct bpf_flow_keys pkt_v4_flow_keys = {
 .thoff = sizeof(struct iphdr),
 .addr_proto = ETH_P_IP,
 .ip_proto = IPPROTO_TCP,
- .n_proto = bpf_htons(ETH_P_IP),
+ .n_proto = __bpf_constant_htons(ETH_P_IP),
 };
 static struct bpf_flow_keys pkt_v6_flow_keys = {
@@ -1962,7 +1962,7 @@ static struct bpf_flow_keys pkt_v6_flow_keys = {
 .thoff = sizeof(struct ipv6hdr),
 .addr_proto = ETH_P_IPV6,
 .ip_proto = IPPROTO_TCP,
- .n_proto = bpf_htons(ETH_P_IPV6),
+ .n_proto = __bpf_constant_htons(ETH_P_IPV6),
 };
 static void test_flow_dissector(void)
-- cgit v1.2.3-59-g8ed1b
From 5c3cf87d477a461274452cb46f7654c5b6ae6294 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Wed, 27 Feb 2019 19:04:10 -0800
Subject: samples: bpf: force IPv4 in ping
ping localhost may default to IPv6 on modern systems, but samples are trying to only parse IPv4. Force IPv4.
samples/bpf/tracex1_user.c doesn't interpret the packet so we don't care which IP version will be used there.
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- samples/bpf/sock_example.c | 2 +- samples/bpf/sockex1_user.c | 2 +- samples/bpf/sockex2_user.c | 2 +- samples/bpf/sockex3_user.c | 2 +- samples/bpf/tracex2_user.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c index 60ec467c78ab..00aae1d33fca 100644 --- a/samples/bpf/sock_example.c +++ b/samples/bpf/sock_example.c @@ -99,7 +99,7 @@ int main(void) { FILE *f; - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); (void)f; return test_sock(); diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c index 93ec01c56104..be8ba5686924 100644 --- a/samples/bpf/sockex1_user.c +++ b/samples/bpf/sockex1_user.c @@ -26,7 +26,7 @@ int main(int ac, char **argv) assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, sizeof(prog_fd[0])) == 0); - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); (void) f; for (i = 0; i < 5; i++) { diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c index 1d5c6e9a6d27..125ee6efc913 100644 --- a/samples/bpf/sockex2_user.c +++ b/samples/bpf/sockex2_user.c @@ -34,7 +34,7 @@ int main(int ac, char **argv) assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, sizeof(prog_fd[0])) == 0); - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); (void) f; for (i = 0; i < 5; i++) { diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index 9d02e0404719..bbb1cd0666a9 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -58,7 +58,7 @@ int main(int argc, char **argv) sizeof(__u32)) == 0); if (argc > 1) - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); else f = popen("netperf -l 4 localhost", "r"); (void) f; diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index 1a81e6a5c2ea..c9544a4ce61a 100644 --- a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -131,7 +131,7 @@ int main(int ac, char **argv) signal(SIGTERM, int_exit); /* start 'ping' in the background to have some kfree_skb events */ - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); (void) f; /* start 'dd' in the background to have plenty of 'write' syscalls */ -- cgit v1.2.3-59-g8ed1b From ea9b6362018358a46008a8af339178469a4efe13 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Feb 2019 19:04:11 -0800 Subject: samples: bpf: remove load_sock_ops in favour of bpftool bpftool can do all the things load_sock_ops used to do, and more. Point users to bpftool instead of maintaining this sample utility. 
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- samples/bpf/.gitignore | 1 - samples/bpf/Makefile | 2 - samples/bpf/load_sock_ops.c | 97 -------------------------------------- samples/bpf/tcp_basertt_kern.c | 2 +- samples/bpf/tcp_bpf.readme | 14 +++--- samples/bpf/tcp_bufs_kern.c | 2 +- samples/bpf/tcp_clamp_kern.c | 2 +- samples/bpf/tcp_cong_kern.c | 2 +- samples/bpf/tcp_iw_kern.c | 2 +- samples/bpf/tcp_rwnd_kern.c | 2 +- samples/bpf/tcp_synrto_kern.c | 2 +- samples/bpf/tcp_tos_reflect_kern.c | 2 +- 12 files changed, 16 insertions(+), 114 deletions(-) delete mode 100644 samples/bpf/load_sock_ops.c diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 8ae4940025f8..dbb817dbacfc 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -1,7 +1,6 @@ cpustat fds_example lathist -load_sock_ops lwt_len_hist map_perf_test offwaketime diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index a333e258f319..4dd98100678e 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -40,7 +40,6 @@ hostprogs-y += lwt_len_hist hostprogs-y += xdp_tx_iptunnel hostprogs-y += test_map_in_map hostprogs-y += per_socket_stats_example -hostprogs-y += load_sock_ops hostprogs-y += xdp_redirect hostprogs-y += xdp_redirect_map hostprogs-y += xdp_redirect_cpu @@ -71,7 +70,6 @@ tracex4-objs := bpf_load.o tracex4_user.o tracex5-objs := bpf_load.o tracex5_user.o tracex6-objs := bpf_load.o tracex6_user.o tracex7-objs := bpf_load.o tracex7_user.o -load_sock_ops-objs := bpf_load.o load_sock_ops.o test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS) lathist-objs := bpf_load.o lathist_user.o diff --git a/samples/bpf/load_sock_ops.c b/samples/bpf/load_sock_ops.c deleted file mode 100644 index 8ecb41ea0c03..000000000000 --- a/samples/bpf/load_sock_ops.c +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- */ -#include -#include -#include -#include -#include -#include "bpf_load.h" -#include -#include -#include -#include - -static void usage(char *pname) -{ - printf("USAGE:\n %s [-l] \n", pname); - printf("\tLoad and attach a sock_ops program to the specified " - "cgroup\n"); - printf("\tIf \"-l\" is used, the program will continue to run\n"); - printf("\tprinting the BPF log buffer\n"); - printf("\tIf the specified filename does not end in \".o\", it\n"); - printf("\tappends \"_kern.o\" to the name\n"); - printf("\n"); - printf(" %s -r \n", pname); - printf("\tDetaches the currently attached sock_ops program\n"); - printf("\tfrom the specified cgroup\n"); - printf("\n"); - exit(1); -} - -int main(int argc, char **argv) -{ - int logFlag = 0; - int error = 0; - char *cg_path; - char fn[500]; - char *prog; - int cg_fd; - - if (argc < 3) - usage(argv[0]); - - if (!strcmp(argv[1], "-r")) { - cg_path = argv[2]; - cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); - error = bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS); - if (error) { - printf("ERROR: bpf_prog_detach: %d (%s)\n", - error, strerror(errno)); - return 2; - } - return 0; - } else if (!strcmp(argv[1], "-h")) { - usage(argv[0]); - } else if (!strcmp(argv[1], "-l")) { - logFlag = 1; - if (argc < 4) - usage(argv[0]); - } - - prog = argv[argc - 1]; - cg_path = argv[argc - 2]; - if (strlen(prog) > 480) { - fprintf(stderr, "ERROR: program name too long (> 480 chars)\n"); - return 3; - } - cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); - - if (!strcmp(prog + strlen(prog)-2, ".o")) - strcpy(fn, prog); - else - sprintf(fn, "%s_kern.o", prog); - if (logFlag) - printf("loading bpf file:%s\n", fn); - if (load_bpf_file(fn)) { - printf("ERROR: load_bpf_file failed for: %s\n", fn); - printf("%s", bpf_log_buf); - return 4; - } - if (logFlag) - printf("TCP BPF Loaded %s\n", fn); - - error = bpf_prog_attach(prog_fd[0], cg_fd, BPF_CGROUP_SOCK_OPS, 0); - if (error) { - printf("ERROR: bpf_prog_attach: %d (%s)\n", - error, strerror(errno)); - return 5; - } else if (logFlag) { - read_trace_pipe(); - } - - return error; -} diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c index 4bf4fc597db9..6ef1625e8b2c 100644 --- a/samples/bpf/tcp_basertt_kern.c +++ b/samples/bpf/tcp_basertt_kern.c @@ -7,7 +7,7 @@ * BPF program to set base_rtt to 80us when host is running TCP-NV and * both hosts are in the same datacenter (as determined by IPv6 prefix). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme index 831fb601e3c9..fee746621aec 100644 --- a/samples/bpf/tcp_bpf.readme +++ b/samples/bpf/tcp_bpf.readme @@ -8,14 +8,16 @@ a cgroupv2 and attach a bash shell to the group. bash echo $$ >> /tmp/cgroupv2/foo/cgroup.procs -Anything that runs under this shell belongs to the foo cgroupv2 To load +Anything that runs under this shell belongs to the foo cgroupv2. To load (attach) one of the tcp_*_kern.o programs: - ./load_sock_ops -l /tmp/cgroupv2/foo tcp_basertt_kern.o + bpftool prog load tcp_basertt_kern.o /sys/fs/bpf/tcp_prog + bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog + bpftool prog tracelog -If the "-l" flag is used, the load_sock_ops program will continue to run -printing the BPF log buffer. The tcp_*_kern.o programs use special print -functions to print logging information (if enabled by the ifdef). 
+"bpftool prog tracelog" will continue to run printing the BPF log buffer. +The tcp_*_kern.o programs use special print functions to print logging +information (if enabled by the ifdef). If using netperf/netserver to create traffic, you need to run them under the cgroupv2 to which the BPF programs are attached (i.e. under bash shell @@ -23,4 +25,4 @@ attached to the cgroupv2). To remove (unattach) a socket_ops BPF program from a cgroupv2: - ./load_sock_ops -r /tmp/cgroupv2/foo + bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c index 0566b7fa38a1..e03e204739fa 100644 --- a/samples/bpf/tcp_bufs_kern.c +++ b/samples/bpf/tcp_bufs_kern.c @@ -9,7 +9,7 @@ * doing appropriate checks that indicate the hosts are far enough * away (i.e. large RTT). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c index f4225c9d2c0c..a0dc2d254aca 100644 --- a/samples/bpf/tcp_clamp_kern.c +++ b/samples/bpf/tcp_clamp_kern.c @@ -9,7 +9,7 @@ * the same datacenter. For his example, we assume they are within the same * datacenter when the first 5.5 bytes of their IPv6 addresses are the same. * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c index ad0f1ba8206a..4fd3ca979a06 100644 --- a/samples/bpf/tcp_cong_kern.c +++ b/samples/bpf/tcp_cong_kern.c @@ -7,7 +7,7 @@ * BPF program to set congestion control to dctcp when both hosts are * in the same datacenter (as deteremined by IPv6 prefix). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c index 4ca5ecc9f580..9b139ec69560 100644 --- a/samples/bpf/tcp_iw_kern.c +++ b/samples/bpf/tcp_iw_kern.c @@ -9,7 +9,7 @@ * would usually be done after doing appropriate checks that indicate * the hosts are far enough away (i.e. large RTT). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c index 09ff65b40b31..cc71ee96e044 100644 --- a/samples/bpf/tcp_rwnd_kern.c +++ b/samples/bpf/tcp_rwnd_kern.c @@ -8,7 +8,7 @@ * and the first 5.5 bytes of the IPv6 addresses are not the same (in this * example that means both hosts are not the same datacenter). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c index 232bb242823e..ca87ed34f896 100644 --- a/samples/bpf/tcp_synrto_kern.c +++ b/samples/bpf/tcp_synrto_kern.c @@ -8,7 +8,7 @@ * and the first 5.5 bytes of the IPv6 addresses are the same (in this example * that means both hosts are in the same datacenter). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. 
*/ #include diff --git a/samples/bpf/tcp_tos_reflect_kern.c b/samples/bpf/tcp_tos_reflect_kern.c index d51dab19eca6..de788be6f862 100644 --- a/samples/bpf/tcp_tos_reflect_kern.c +++ b/samples/bpf/tcp_tos_reflect_kern.c @@ -4,7 +4,7 @@ * * BPF program to automatically reflect TOS option from received syn packet * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include -- cgit v1.2.3-59-g8ed1b From f74a53d9a567f6bc6f6d8460e84c76bd2a45d016 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Feb 2019 19:04:12 -0800 Subject: tools: libbpf: add a correctly named define for map iteration For historical reasons the helper to loop over maps in an object is called bpf_map__for_each while it really should be called bpf_object__for_each_map. Rename and add a correctly named define for backward compatibility. Switch all in-tree users to the correct name (Quentin). Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/prog.c | 4 ++-- tools/lib/bpf/libbpf.c | 8 ++++---- tools/lib/bpf/libbpf.h | 3 ++- tools/perf/util/bpf-loader.c | 4 ++-- tools/testing/selftests/bpf/test_libbpf_open.c | 2 +- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 0c35dd543d49..8ef80d65a474 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1053,7 +1053,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) j = 0; while (j < old_map_fds && map_replace[j].name) { i = 0; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { if (!strcmp(bpf_map__name(map), map_replace[j].name)) { map_replace[j].idx = i; break; @@ -1074,7 +1074,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) /* Set ifindex and name reuse */ j = 0; idx = 0; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { if (!bpf_map__is_offload_neutral(map)) bpf_map__set_ifindex(map, ifindex); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b38dcbe7460a..f5eb60379c8d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2100,7 +2100,7 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path) if (err) return err; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { char buf[PATH_MAX]; int len; @@ -2147,7 +2147,7 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) if (!obj) return -ENOENT; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { char buf[PATH_MAX]; int len; @@ -2835,7 +2835,7 @@ bpf_object__find_map_by_name(struct bpf_object *obj, const char *name) { struct bpf_map *pos; - bpf_map__for_each(pos, obj) { + bpf_object__for_each_map(pos, obj) { if (pos->name && !strcmp(pos->name, name)) return pos; } @@ -2928,7 +2928,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, first_prog = prog; } - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { if (!bpf_map__is_offload_neutral(map)) map->map_ifindex = attr->ifindex; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 6c0168f8bba5..b4652aa1a58a 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -278,10 +278,11 @@ bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset); LIBBPF_API struct bpf_map * bpf_map__next(struct bpf_map *map, struct bpf_object *obj); -#define bpf_map__for_each(pos, obj) \ +#define 
bpf_object__for_each_map(pos, obj) \ for ((pos) = bpf_map__next(NULL, (obj)); \ (pos) != NULL; \ (pos) = bpf_map__next((pos), (obj))) +#define bpf_map__for_each bpf_object__for_each_map LIBBPF_API struct bpf_map * bpf_map__prev(struct bpf_map *map, struct bpf_object *obj); diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 037d8ff6a634..31b7e5a1453b 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -1489,7 +1489,7 @@ apply_obj_config_object(struct bpf_object *obj) struct bpf_map *map; int err; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { err = apply_obj_config_map(map); if (err) return err; @@ -1513,7 +1513,7 @@ int bpf__apply_obj_config(void) #define bpf__for_each_map(pos, obj, objtmp) \ bpf_object__for_each_safe(obj, objtmp) \ - bpf_map__for_each(pos, obj) + bpf_object__for_each_map(pos, obj) #define bpf__for_each_map_named(pos, obj, objtmp, name) \ bpf__for_each_map(pos, obj, objtmp) \ diff --git a/tools/testing/selftests/bpf/test_libbpf_open.c b/tools/testing/selftests/bpf/test_libbpf_open.c index 1909ecf4d999..65cbd30704b5 100644 --- a/tools/testing/selftests/bpf/test_libbpf_open.c +++ b/tools/testing/selftests/bpf/test_libbpf_open.c @@ -67,7 +67,7 @@ int test_walk_maps(struct bpf_object *obj, bool verbose) struct bpf_map *map; int cnt = 0; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { cnt++; if (verbose) printf("Map (count:%d) name: %s\n", cnt, -- cgit v1.2.3-59-g8ed1b From 1a9b268c90286cae99051353cb7dfb53ffd82676 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Feb 2019 19:04:13 -0800 Subject: samples: bpf: use libbpf where easy Some samples don't really need the magic of bpf_load, switch them to libbpf. v2: - specify program types. 
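For reference, the shape the converted samples end up with is roughly the sketch below (illustrative only, error handling trimmed; "my_map" stands in for whichever map a given sample defines):

    #include <linux/bpf.h>
    #include "bpf/libbpf.h"

    /* Load the object through libbpf and look the map up by name,
     * instead of relying on bpf_load's global prog_fd[]/map_fd[] arrays.
     */
    static int load_sample(const char *obj_path, int *map_fd)
    {
            struct bpf_object *obj;
            int prog_fd;

            if (bpf_prog_load(obj_path, BPF_PROG_TYPE_SOCKET_FILTER,
                              &obj, &prog_fd))
                    return -1;

            *map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
            if (*map_fd < 0)
                    return -1;

            return prog_fd; /* attach e.g. via setsockopt(SO_ATTACH_BPF) */
    }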
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- samples/bpf/Makefile | 6 +++--- samples/bpf/fds_example.c | 10 +++++++--- samples/bpf/sockex1_user.c | 23 +++++++++++++---------- samples/bpf/sockex2_user.c | 21 ++++++++++++--------- 4 files changed, 35 insertions(+), 25 deletions(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 4dd98100678e..0c62ac39c697 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -59,9 +59,9 @@ LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o -fds_example-objs := bpf_load.o fds_example.o -sockex1-objs := bpf_load.o sockex1_user.o -sockex2-objs := bpf_load.o sockex2_user.o +fds_example-objs := fds_example.o +sockex1-objs := sockex1_user.o +sockex2-objs := sockex2_user.o sockex3-objs := bpf_load.o sockex3_user.o tracex1-objs := bpf_load.o tracex1_user.o tracex2-objs := bpf_load.o tracex2_user.o diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c index 9854854f05d1..e51eb060244e 100644 --- a/samples/bpf/fds_example.c +++ b/samples/bpf/fds_example.c @@ -14,8 +14,8 @@ #include +#include "bpf/libbpf.h" #include "bpf_insn.h" -#include "bpf_load.h" #include "sock_example.h" #define BPF_F_PIN (1 << 0) @@ -57,10 +57,14 @@ static int bpf_prog_create(const char *object) BPF_EXIT_INSN(), }; size_t insns_cnt = sizeof(insns) / sizeof(struct bpf_insn); + char bpf_log_buf[BPF_LOG_BUF_SIZE]; + struct bpf_object *obj; + int prog_fd; if (object) { - assert(!load_bpf_file((char *)object)); - return prog_fd[0]; + assert(!bpf_prog_load(object, BPF_PROG_TYPE_UNSPEC, + &obj, &prog_fd)); + return prog_fd; } else { return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, insns, insns_cnt, "GPL", 0, diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c index be8ba5686924..7f90796ae15a 100644 --- a/samples/bpf/sockex1_user.c +++ b/samples/bpf/sockex1_user.c @@ -3,28 +3,31 @@ #include #include #include -#include "bpf_load.h" +#include "bpf/libbpf.h" #include "sock_example.h" #include #include int main(int ac, char **argv) { + struct bpf_object *obj; + int map_fd, prog_fd; char filename[256]; - FILE *f; int i, sock; + FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); + if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, + &obj, &prog_fd)) return 1; - } + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); sock = open_raw_sock("lo"); - assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, - sizeof(prog_fd[0])) == 0); + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, + sizeof(prog_fd)) == 0); f = popen("ping -4 -c5 localhost", "r"); (void) f; @@ -34,13 +37,13 @@ int main(int ac, char **argv) int key; key = IPPROTO_TCP; - assert(bpf_map_lookup_elem(map_fd[0], &key, &tcp_cnt) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0); key = IPPROTO_UDP; - assert(bpf_map_lookup_elem(map_fd[0], &key, &udp_cnt) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0); key = IPPROTO_ICMP; - assert(bpf_map_lookup_elem(map_fd[0], &key, &icmp_cnt) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0); printf("TCP %lld UDP %lld ICMP %lld bytes\n", tcp_cnt, udp_cnt, icmp_cnt); diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c index 125ee6efc913..bc257333ad92 100644 --- a/samples/bpf/sockex2_user.c +++ 
b/samples/bpf/sockex2_user.c @@ -3,7 +3,7 @@ #include #include #include -#include "bpf_load.h" +#include "bpf/libbpf.h" #include "sock_example.h" #include #include @@ -17,22 +17,25 @@ struct pair { int main(int ac, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_object *obj; + int map_fd, prog_fd; char filename[256]; - FILE *f; int i, sock; + FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); setrlimit(RLIMIT_MEMLOCK, &r); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); + if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, + &obj, &prog_fd)) return 1; - } + + map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map"); sock = open_raw_sock("lo"); - assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, - sizeof(prog_fd[0])) == 0); + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, + sizeof(prog_fd)) == 0); f = popen("ping -4 -c5 localhost", "r"); (void) f; @@ -41,8 +44,8 @@ int main(int ac, char **argv) int key = 0, next_key; struct pair value; - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[0], &next_key, &value); + while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(map_fd, &next_key, &value); printf("ip %s bytes %lld packets %lld\n", inet_ntoa((struct in_addr){htonl(next_key)}), value.bytes, value.packets); -- cgit v1.2.3-59-g8ed1b From 771744f9dc9742dc4259eb57f3a687d1630e1159 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Feb 2019 19:04:14 -0800 Subject: tools: libbpf: make sure readelf shows full names in build checks readelf truncates its output by default to attempt to make it more readable. This can lead to function names getting aliased if they differ late in the string. Use --wide parameter to avoid truncation. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- tools/lib/bpf/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 761691bd72ad..a05c43468bd0 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -132,9 +132,9 @@ BPF_IN := $(OUTPUT)libbpf-in.o LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE)) VERSION_SCRIPT := libbpf.map -GLOBAL_SYM_COUNT = $(shell readelf -s $(BPF_IN) | \ +GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN) | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {s++} END{print s}') -VERSIONED_SYM_COUNT = $(shell readelf -s $(OUTPUT)libbpf.so | \ +VERSIONED_SYM_COUNT = $(shell readelf -s --wide $(OUTPUT)libbpf.so | \ grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l) CMD_TARGETS = $(LIB_FILE) -- cgit v1.2.3-59-g8ed1b From 3d8669e63751b7a3f8fac109cd350c5f6c14e3e1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 28 Feb 2019 21:06:47 +0300 Subject: tools/libbpf: signedness bug in btf_dedup_ref_type() The "ref_type_id" variable needs to be signed for the error handling to work. 
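A tiny standalone demonstration of the problem (resolve_ref() is only a stand-in for btf_dedup_ref_type(), which returns a negative errno on failure):

    #include <errno.h>
    #include <stdio.h>

    static int resolve_ref(void) { return -ELOOP; }

    int main(void)
    {
            unsigned int id = resolve_ref();  /* buggy: -ELOOP wraps to a huge value */
            int sid = resolve_ref();          /* fixed: keeps its sign */

            if (id < 0)                       /* always false for an unsigned type */
                    printf("unsigned check caught the error\n");
            else
                    printf("unsigned check missed it: id = %u\n", id);

            if (sid < 0)                      /* works as intended */
                    printf("signed check caught the error: %d\n", sid);
            return 0;
    }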
Fixes: d5caef5b5655 ("btf: add BTF types deduplication algorithm") Signed-off-by: Dan Carpenter Acked-by: Andrii Nakryiko Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- tools/lib/bpf/btf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 68b50e9bbde1..00a2f06e38fd 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -2326,7 +2326,8 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) struct btf_type *t, *cand; /* if we don't find equivalent type, then we are representative type */ __u32 new_id = type_id; - __u32 h, ref_type_id; + int ref_type_id; + __u32 h; if (d->map[type_id] == BTF_IN_PROGRESS_ID) return -ELOOP; -- cgit v1.2.3-59-g8ed1b From 8054d51f76c8da880115ffba6189b0efd7a5b5ba Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 15:31:21 -0800 Subject: selftests/bpf: fix btf_dedup testing code btf_dedup testing code doesn't account for length of struct btf_header when calculating the start of a string section. This patch fixes this problem. Fixes: 49b57e0d01db ("tools/bpf: remove btf__get_strings() superseded by raw data API") Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/test_btf.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index e47168d1257d..3b74d23fffab 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -14,6 +14,7 @@ feature test_libbpf_open test_sock test_sock_addr +test_sock_fields urandom_read test_btf test_sockmap diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 02d314383a9c..1426c0a905c8 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -5936,9 +5936,9 @@ static int do_test_dedup(unsigned int test_num) } test_hdr = test_btf_data; - test_strs = test_btf_data + test_hdr->str_off; + test_strs = test_btf_data + sizeof(*test_hdr) + test_hdr->str_off; expect_hdr = expect_btf_data; - expect_strs = expect_btf_data + expect_hdr->str_off; + expect_strs = expect_btf_data + sizeof(*test_hdr) + expect_hdr->str_off; if (CHECK(test_hdr->str_len != expect_hdr->str_len, "test_hdr->str_len:%u != expect_hdr->str_len:%u", test_hdr->str_len, expect_hdr->str_len)) { -- cgit v1.2.3-59-g8ed1b From 1baabdc1089eb807cdcabebad50b36c8b9895a48 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 15:31:22 -0800 Subject: libbpf: fix formatting for btf_ext__get_raw_data Fix invalid formatting of pointer arg. 
Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- tools/lib/bpf/btf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 94bbc249b0f1..b60bb7cf5fff 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -76,7 +76,7 @@ LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, LIBBPF_API struct btf_ext *btf_ext__new(__u8 *data, __u32 size); LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); -LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext* btf_ext, +LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size); LIBBPF_API int btf_ext__reloc_func_info(const struct btf *btf, const struct btf_ext *btf_ext, -- cgit v1.2.3-59-g8ed1b From 51edf5f6e015c48b62e24ab2fbcad8885ca1c74e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 15:31:23 -0800 Subject: btf: allow to customize dedup hash table size Default size of dedup table (16k) is good enough for most binaries, even typical vmlinux images. But there are cases of binaries with huge amount of BTF types (e.g., allyesconfig variants of kernel), which benefit from having bigger dedup table size to lower amount of unnecessary hash collisions. Tools like pahole, thus, can tune this parameter to reach optimal performance. This change also serves double purpose of allowing tests to force hash collisions to test some corner cases, used in follow up patch. Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/lib/bpf/btf.c | 53 ++++++++++++++++++++++++++++++++++++----------------- tools/lib/bpf/btf.h | 1 + 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 00a2f06e38fd..820f7fc8ebcc 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1070,8 +1070,8 @@ done: return err; } -#define BTF_DEDUP_TABLE_SIZE_LOG 14 -#define BTF_DEDUP_TABLE_MOD ((1 << BTF_DEDUP_TABLE_SIZE_LOG) - 1) +#define BTF_DEDUP_TABLE_DEFAULT_SIZE (1 << 14) +#define BTF_DEDUP_TABLE_MAX_SIZE_LOG 31 #define BTF_UNPROCESSED_ID ((__u32)-1) #define BTF_IN_PROGRESS_ID ((__u32)-2) @@ -1128,18 +1128,21 @@ static inline __u32 hash_combine(__u32 h, __u32 value) #undef GOLDEN_RATIO_PRIME } -#define for_each_hash_node(table, hash, node) \ - for (node = table[hash & BTF_DEDUP_TABLE_MOD]; node; node = node->next) +#define for_each_dedup_cand(d, hash, node) \ + for (node = d->dedup_table[hash & (d->opts.dedup_table_size - 1)]; \ + node; \ + node = node->next) static int btf_dedup_table_add(struct btf_dedup *d, __u32 hash, __u32 type_id) { struct btf_dedup_node *node = malloc(sizeof(struct btf_dedup_node)); + int bucket = hash & (d->opts.dedup_table_size - 1); if (!node) return -ENOMEM; node->type_id = type_id; - node->next = d->dedup_table[hash & BTF_DEDUP_TABLE_MOD]; - d->dedup_table[hash & BTF_DEDUP_TABLE_MOD] = node; + node->next = d->dedup_table[bucket]; + d->dedup_table[bucket] = node; return 0; } @@ -1177,7 +1180,7 @@ static void btf_dedup_table_free(struct btf_dedup *d) if (!d->dedup_table) return; - for (i = 0; i < (1 << BTF_DEDUP_TABLE_SIZE_LOG); i++) { + for (i = 0; i < d->opts.dedup_table_size; i++) { while (d->dedup_table[i]) { tmp = d->dedup_table[i]; d->dedup_table[i] = tmp->next; @@ -1212,19 +1215,37 @@ static void btf_dedup_free(struct btf_dedup *d) free(d); } +/* Find closest power of two >= to size, capped at 2^max_size_log */ +static __u32 roundup_pow2_max(__u32 size, int 
max_size_log) +{ + int i; + + for (i = 0; i < max_size_log && (1U << i) < size; i++) + ; + return 1U << i; +} + + static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, const struct btf_dedup_opts *opts) { struct btf_dedup *d = calloc(1, sizeof(struct btf_dedup)); int i, err = 0; + __u32 sz; if (!d) return ERR_PTR(-ENOMEM); + d->opts.dont_resolve_fwds = opts && opts->dont_resolve_fwds; + sz = opts && opts->dedup_table_size ? opts->dedup_table_size + : BTF_DEDUP_TABLE_DEFAULT_SIZE; + sz = roundup_pow2_max(sz, BTF_DEDUP_TABLE_MAX_SIZE_LOG); + d->opts.dedup_table_size = sz; + d->btf = btf; d->btf_ext = btf_ext; - d->dedup_table = calloc(1 << BTF_DEDUP_TABLE_SIZE_LOG, + d->dedup_table = calloc(d->opts.dedup_table_size, sizeof(struct btf_dedup_node *)); if (!d->dedup_table) { err = -ENOMEM; @@ -1249,8 +1270,6 @@ static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, for (i = 0; i <= btf->nr_types; i++) d->hypot_map[i] = BTF_UNPROCESSED_ID; - d->opts.dont_resolve_fwds = opts && opts->dont_resolve_fwds; - done: if (err) { btf_dedup_free(d); @@ -1824,7 +1843,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_INT: h = btf_hash_int(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_int(t, cand)) { new_id = cand_node->type_id; @@ -1835,7 +1854,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_ENUM: h = btf_hash_enum(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_enum(t, cand)) { new_id = cand_node->type_id; @@ -1846,7 +1865,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_FWD: h = btf_hash_common(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_common(t, cand)) { new_id = cand_node->type_id; @@ -2263,7 +2282,7 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) return 0; h = btf_hash_struct(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { int eq; btf_dedup_clear_hypot_map(d); @@ -2350,7 +2369,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) t->type = ref_type_id; h = btf_hash_common(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_common(t, cand)) { new_id = cand_node->type_id; @@ -2373,7 +2392,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) info->index_type = ref_type_id; h = btf_hash_array(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_array(t, cand)) { new_id = cand_node->type_id; @@ -2404,7 +2423,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) } h = btf_hash_fnproto(t); - for_each_hash_node(d->dedup_table, h, cand_node) { + for_each_dedup_cand(d, h, cand_node) { cand = d->btf->types[cand_node->type_id]; if (btf_equal_fnproto(t, cand)) { new_id = cand_node->type_id; diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index b60bb7cf5fff..28a1e1e59861 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -90,6 +90,7 @@ LIBBPF_API __u32 btf_ext__func_info_rec_size(const struct 
btf_ext *btf_ext); LIBBPF_API __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext); struct btf_dedup_opts { + unsigned int dedup_table_size; bool dont_resolve_fwds; }; -- cgit v1.2.3-59-g8ed1b From 91097fbee4c025ac72f91ae41feba3a822cc1316 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 15:31:24 -0800 Subject: btf: fix bug with resolving STRUCT/UNION into corresponding FWD When checking available canonical candidates for struct/union algorithm utilizes btf_dedup_is_equiv to determine if candidate is suitable. This check is not enough when candidate is corresponding FWD for that struct/union, because according to equivalence logic they are equivalent. When it so happens that FWD and STRUCT/UNION end in hashing to the same bucket, it's possible to create remapping loop from FWD to STRUCT and STRUCT to same FWD, which will cause btf_dedup() to loop forever. This patch fixes the issue by additionally checking that type and canonical candidate are strictly equal (utilizing btf_equal_struct). Fixes: d5caef5b5655 ("btf: add BTF types deduplication algorithm") Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/lib/bpf/btf.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 820f7fc8ebcc..1b8d8cdd3575 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1663,7 +1663,7 @@ static __u32 btf_hash_struct(struct btf_type *t) * IDs. This check is performed during type graph equivalence check and * referenced types equivalence is checked separately. */ -static bool btf_equal_struct(struct btf_type *t1, struct btf_type *t2) +static bool btf_shallow_equal_struct(struct btf_type *t1, struct btf_type *t2) { struct btf_member *m1, *m2; __u16 vlen; @@ -2124,7 +2124,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, struct btf_member *cand_m, *canon_m; __u16 vlen; - if (!btf_equal_struct(cand_type, canon_type)) + if (!btf_shallow_equal_struct(cand_type, canon_type)) return 0; vlen = BTF_INFO_VLEN(cand_type->info); cand_m = (struct btf_member *)(cand_type + 1); @@ -2265,7 +2265,7 @@ static void btf_dedup_merge_hypot_map(struct btf_dedup *d) static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) { struct btf_dedup_node *cand_node; - struct btf_type *t; + struct btf_type *cand_type, *t; /* if we don't find equivalent type, then we are canonical */ __u32 new_id = type_id; __u16 kind; @@ -2285,6 +2285,20 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) for_each_dedup_cand(d, h, cand_node) { int eq; + /* + * Even though btf_dedup_is_equiv() checks for + * btf_shallow_equal_struct() internally when checking two + * structs (unions) for equivalence, we need to guard here + * from picking matching FWD type as a dedup candidate. + * This can happen due to hash collision. In such case just + * relying on btf_dedup_is_equiv() would lead to potentially + * creating a loop (FWD -> STRUCT and STRUCT -> FWD), because + * FWD and compatible STRUCT/UNION are considered equivalent. 
+ */ + cand_type = d->btf->types[cand_node->type_id]; + if (!btf_shallow_equal_struct(t, cand_type)) + continue; + btf_dedup_clear_hypot_map(d); eq = btf_dedup_is_equiv(d, type_id, cand_node->type_id); if (eq < 0) -- cgit v1.2.3-59-g8ed1b From 7c7a4890c87dd2eb77ef144e5153a7df4c0d6f53 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 15:31:25 -0800 Subject: selftests/bpf: add btf_dedup test of FWD/STRUCT resolution This patch adds a btf_dedup test exercising logic of STRUCT<->FWD resolution and validating that STRUCT is not resolved to a FWD. It also forces hash collisions, forcing both FWD and STRUCT to be candidates for each other. Previously this condition caused infinite loop due to FWD pointing to STRUCT and STRUCT pointing to its FWD. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_btf.c | 45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 1426c0a905c8..38797aa627a7 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -5731,6 +5731,51 @@ const struct btf_dedup_test dedup_tests[] = { .dont_resolve_fwds = false, }, }, +{ + .descr = "dedup: struct <-> fwd resolution w/ hash collision", + /* + * // CU 1: + * struct x; + * struct s { + * struct x *x; + * }; + * // CU 2: + * struct x {}; + * struct s { + * struct x *x; + * }; + */ + .input = { + .raw_types = { + /* CU 1 */ + BTF_FWD_ENC(NAME_TBD, 0 /* struct fwd */), /* [1] fwd x */ + BTF_PTR_ENC(1), /* [2] ptr -> [1] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [3] struct s */ + BTF_MEMBER_ENC(NAME_TBD, 2, 0), + /* CU 2 */ + BTF_STRUCT_ENC(NAME_TBD, 0, 0), /* [4] struct x */ + BTF_PTR_ENC(4), /* [5] ptr -> [4] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [6] struct s */ + BTF_MEMBER_ENC(NAME_TBD, 5, 0), + BTF_END_RAW, + }, + BTF_STR_SEC("\0x\0s\0x\0x\0s\0x\0"), + }, + .expect = { + .raw_types = { + BTF_PTR_ENC(3), /* [1] ptr -> [3] */ + BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [2] struct s */ + BTF_MEMBER_ENC(NAME_TBD, 1, 0), + BTF_STRUCT_ENC(NAME_NTH(2), 0, 0), /* [3] struct x */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0s\0x"), + }, + .opts = { + .dont_resolve_fwds = false, + .dedup_table_size = 1, /* force hash collisions */ + }, +}, { .descr = "dedup: all possible kinds (no duplicates)", .input = { -- cgit v1.2.3-59-g8ed1b From 4b9113045b1745ec8512d6743680809edca6a74e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Mar 2019 14:33:11 -0800 Subject: bpf: fix u64_stats_init() usage in bpf_prog_alloc() We need to iterate through all possible cpus. 
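The idiom this patch applies looks roughly like the sketch below (illustrative only; pcpu_stats stands in for the per-cpu stats structure): every possible CPU's copy carries its own u64_stats_sync, so each instance must be initialized through per_cpu_ptr() rather than once through the raw per-cpu pointer.

    #include <linux/types.h>
    #include <linux/percpu.h>
    #include <linux/u64_stats_sync.h>

    struct pcpu_stats {
            u64 cnt;
            u64 nsecs;
            struct u64_stats_sync syncp;
    };

    static void init_stats(struct pcpu_stats __percpu *stats)
    {
            int cpu;

            /* touch every possible CPU's instance, not just one */
            for_each_possible_cpu(cpu)
                    u64_stats_init(&per_cpu_ptr(stats, cpu)->syncp);
    }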
Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Eric Dumazet Reported-by: Guenter Roeck Tested-by: Guenter Roeck Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1c14c347f3cf..3f08c257858e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -109,6 +109,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *prog; + int cpu; prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); if (!prog) @@ -121,7 +122,12 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) return NULL; } - u64_stats_init(&prog->aux->stats->syncp); + for_each_possible_cpu(cpu) { + struct bpf_prog_stats *pstats; + + pstats = per_cpu_ptr(prog->aux->stats, cpu); + u64_stats_init(&pstats->syncp); + } return prog; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); -- cgit v1.2.3-59-g8ed1b From 5efc529fb428e042c08a598b9afc5c5e2c600d74 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 17:12:19 -0800 Subject: docs/btf: fix typos, improve wording Fix various typos, some of the formatting and wording for Documentation/btf.rst. Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- Documentation/bpf/btf.rst | 108 +++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 55 deletions(-) diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 1d434c3a268d..1d761f1c5b2b 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -5,7 +5,7 @@ BPF Type Format (BTF) 1. Introduction *************** -BTF (BPF Type Format) is the meta data format which +BTF (BPF Type Format) is the metadata format which encodes the debug info related to BPF program/map. The name BTF was used initially to describe data types. The BTF was later extended to include @@ -40,8 +40,8 @@ details in :ref:`BTF_Type_String`. 2. BTF Type and String Encoding ******************************* -The file ``include/uapi/linux/btf.h`` provides high -level definition on how types/strings are encoded. +The file ``include/uapi/linux/btf.h`` provides high-level +definition of how types/strings are encoded. The beginning of data blob must be:: @@ -59,23 +59,23 @@ The beginning of data blob must be:: }; The magic is ``0xeB9F``, which has different encoding for big and little -endian system, and can be used to test whether BTF is generated for -big or little endian target. -The btf_header is designed to be extensible with hdr_len equal to -``sizeof(struct btf_header)`` when the data blob is generated. +endian systems, and can be used to test whether BTF is generated for +big- or little-endian target. +The ``btf_header`` is designed to be extensible with ``hdr_len`` equal to +``sizeof(struct btf_header)`` when a data blob is generated. 2.1 String Encoding =================== The first string in the string section must be a null string. -The rest of string table is a concatenation of other null-treminated +The rest of string table is a concatenation of other null-terminated strings. 2.2 Type Encoding ================= The type id ``0`` is reserved for ``void`` type. -The type section is parsed sequentially and the type id is assigned to +The type section is parsed sequentially and type id is assigned to each recognized type starting from id ``1``. 
Currently, the following types are supported:: @@ -122,9 +122,9 @@ Each type contains the following common data:: }; }; -For certain kinds, the common data are followed by kind specific data. -The ``name_off`` in ``struct btf_type`` specifies the offset in the string table. -The following details encoding of each kind. +For certain kinds, the common data are followed by kind-specific data. +The ``name_off`` in ``struct btf_type`` specifies the offset in the string +table. The following sections detail encoding of each kind. 2.2.1 BTF_KIND_INT ~~~~~~~~~~~~~~~~~~ @@ -136,7 +136,7 @@ The following details encoding of each kind. * ``info.vlen``: 0 * ``size``: the size of the int type in bytes. -``btf_type`` is followed by a ``u32`` with following bits arrangement:: +``btf_type`` is followed by a ``u32`` with the following bits arrangement:: #define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) #define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) @@ -148,7 +148,7 @@ The ``BTF_INT_ENCODING`` has the following attributes:: #define BTF_INT_CHAR (1 << 1) #define BTF_INT_BOOL (1 << 2) -The ``BTF_INT_ENCODING()`` provides extra information, signness, +The ``BTF_INT_ENCODING()`` provides extra information: signedness, char, or bool, for the int type. The char and bool encoding are mostly useful for pretty print. At most one encoding can be specified for the int type. @@ -161,8 +161,7 @@ The maximum value of ``BTF_INT_BITS()`` is 128. The ``BTF_INT_OFFSET()`` specifies the starting bit offset to calculate values for this int. For example, a bitfield struct -member has - +member has: * btf member bit offset 100 from the start of the structure, * btf member pointing to an int type, * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` @@ -179,7 +178,7 @@ access the same bits as the above: The original intention of ``BTF_INT_OFFSET()`` is to provide flexibility of bitfield encoding. -Currently, both llvm and pahole generates ``BTF_INT_OFFSET() = 0`` +Currently, both llvm and pahole generate ``BTF_INT_OFFSET() = 0`` for all int types. 2.2.2 BTF_KIND_PTR @@ -204,7 +203,7 @@ No additional type data follow ``btf_type``. * ``info.vlen``: 0 * ``size/type``: 0, not used -btf_type is followed by one "struct btf_array":: +``btf_type`` is followed by one ``struct btf_array``:: struct btf_array { __u32 type; @@ -217,27 +216,26 @@ The ``struct btf_array`` encoding: * ``index_type``: the index type * ``nelems``: the number of elements for this array (``0`` is also allowed). -The ``index_type`` can be any regular int types -(u8, u16, u32, u64, unsigned __int128). -The original design of including ``index_type`` follows dwarf -which has a ``index_type`` for its array type. +The ``index_type`` can be any regular int type +(``u8``, ``u16``, ``u32``, ``u64``, ``unsigned __int128``). +The original design of including ``index_type`` follows DWARF, +which has an ``index_type`` for its array type. Currently in BTF, beyond type verification, the ``index_type`` is not used. The ``struct btf_array`` allows chaining through element type to represent -multiple dimensional arrays. For example, ``int a[5][6]``, the following -type system illustrates the chaining: +multidimensional arrays. 
For example, for ``int a[5][6]``, the following +type information illustrates the chaining: * [1]: int * [2]: array, ``btf_array.type = [1]``, ``btf_array.nelems = 6`` * [3]: array, ``btf_array.type = [2]``, ``btf_array.nelems = 5`` -Currently, both pahole and llvm collapse multiple dimensional array -into one dimensional array, e.g., ``a[5][6]``, the btf_array.nelems -equal to ``30``. This is because the original use case is map pretty -print where the whole array is dumped out so one dimensional array +Currently, both pahole and llvm collapse multidimensional array +into one-dimensional array, e.g., for ``a[5][6]``, the ``btf_array.nelems`` +is equal to ``30``. This is because the original use case is map pretty +print where the whole array is dumped out so one-dimensional array is enough. As more BTF usage is explored, pahole and llvm can be -changed to generate proper chained representation for -multiple dimensional arrays. +changed to generate proper chained representation for multidimensional arrays. 2.2.4 BTF_KIND_STRUCT ~~~~~~~~~~~~~~~~~~~~~ @@ -382,7 +380,7 @@ No additional type data follow ``btf_type``. No additional type data follow ``btf_type``. -A BTF_KIND_FUNC defines, not a type, but a subprogram (function) whose +A BTF_KIND_FUNC defines not a type, but a subprogram (function) whose signature is defined by ``type``. The subprogram is thus an instance of that type. The BTF_KIND_FUNC may in turn be referenced by a func_info in the :ref:`BTF_Ext_Section` (ELF) or in the arguments to @@ -459,10 +457,10 @@ The workflow typically looks like: 3.1 BPF_BTF_LOAD ================ -Load a blob of BTF data into kernel. A blob of data -described in :ref:`BTF_Type_String` +Load a blob of BTF data into kernel. A blob of data, +described in :ref:`BTF_Type_String`, can be directly loaded into the kernel. -A ``btf_fd`` returns to userspace. +A ``btf_fd`` is returned to a userspace. 3.2 BPF_MAP_CREATE ================== @@ -487,7 +485,7 @@ In libbpf, the map can be defined with extra annotation like below: Here, the parameters for macro BPF_ANNOTATE_KV_PAIR are map name, key and value types for the map. During ELF parsing, libbpf is able to extract key/value type_id's -and assigned them to BPF_MAP_CREATE attributes automatically. +and assign them to BPF_MAP_CREATE attributes automatically. .. _BPF_Prog_Load: @@ -532,7 +530,7 @@ Below are requirements for func_info: bpf func boundaries. Below are requirements for line_info: - * the first insn in each func must points to a line_info record. + * the first insn in each func must have a line_info record pointing to it. * the line_info insn_off is in strictly increasing order. For line_info, the line number and column number are defined as below: @@ -544,26 +542,26 @@ For line_info, the line number and column number are defined as below: 3.4 BPF_{PROG,MAP}_GET_NEXT_ID In kernel, every loaded program, map or btf has a unique id. -The id won't change during the life time of the program, map or btf. +The id won't change during the lifetime of a program, map, or btf. The bpf syscall command BPF_{PROG,MAP}_GET_NEXT_ID returns all id's, one for each command, to user space, for bpf -program or maps, -so the inspection tool can inspect all programs and maps. +program or maps, respectively, +so an inspection tool can inspect all programs and maps. 3.5 BPF_{PROG,MAP}_GET_FD_BY_ID -The introspection tool cannot use id to get details about program or maps. -A file descriptor needs to be obtained first for reference counting purpose. 
+An introspection tool cannot use id to get details about program or maps. +A file descriptor needs to be obtained first for reference-counting purpose. 3.6 BPF_OBJ_GET_INFO_BY_FD ========================== -Once a program/map fd is acquired, the introspection tool can +Once a program/map fd is acquired, an introspection tool can get the detailed information from kernel about this fd, -some of which is btf related. For example, -``bpf_map_info`` returns ``btf_id``, key/value type id. -``bpf_prog_info`` returns ``btf_id``, func_info and line info +some of which are BTF-related. For example, +``bpf_map_info`` returns ``btf_id`` and key/value type ids. +``bpf_prog_info`` returns ``btf_id``, func_info, and line info for translated bpf byte codes, and jited_line_info. 3.7 BPF_BTF_GET_FD_BY_ID @@ -574,9 +572,9 @@ bpf syscall command BPF_BTF_GET_FD_BY_ID can retrieve a btf fd. Then, with command BPF_OBJ_GET_INFO_BY_FD, the btf blob, originally loaded into the kernel with BPF_BTF_LOAD, can be retrieved. -With the btf blob, ``bpf_map_info`` and ``bpf_prog_info``, the introspection +With the btf blob, ``bpf_map_info``, and ``bpf_prog_info``, an introspection tool has full btf knowledge and is able to pretty print map key/values, -dump func signatures, dump line info along with byte/jit codes. +dump func signatures and line info, along with byte/jit codes. 4. ELF File Format Interface **************************** @@ -625,8 +623,8 @@ The func_info is organized as below.:: ... ``func_info_rec_size`` specifies the size of ``bpf_func_info`` structure -when .BTF.ext is generated. btf_ext_info_sec, defined below, is -the func_info for each specific ELF section.:: +when .BTF.ext is generated. ``btf_ext_info_sec``, defined below, is +a collection of func_info for each specific ELF section.:: struct btf_ext_info_sec { __u32 sec_name_off; /* offset to section name */ @@ -661,7 +659,7 @@ from the beginning of section (``btf_ext_info_sec->sec_name_off``). With BTF, the map key/value can be printed based on fields rather than simply raw bytes. This is especially -valuable for large structure or if you data structure +valuable for large structure or if your data structure has bitfields. For example, for the following map,:: enum A { A1, A2, A3, A4, A5 }; @@ -702,8 +700,8 @@ bpftool is able to pretty print like below: 5.2 bpftool prog dump ===================== -The following is an example to show func_info and line_info -can help prog dump with better kernel symbol name, function prototype +The following is an example showing how func_info and line_info +can help prog dump with better kernel symbol names, function prototypes and line information.:: $ bpftool prog dump jited pinned /sys/fs/bpf/test_btf_haskv @@ -733,10 +731,10 @@ and line information.:: ; counts = bpf_map_lookup_elem(&btf_map, &key); [...] -5.3 verifier log +5.3 Verifier Log ================ -The following is an example how line_info can help verifier failure debug.:: +The following is an example of how line_info can help debugging verification failure.:: /* The code at tools/testing/selftests/bpf/test_xdp_noinline.c * is modified as below. @@ -867,4 +865,4 @@ The assembly code (-S) is able to show the BTF encoding in assembly format.:: 7. Testing ********** -Kernel bpf selftest `test_btf.c` provides extensive set of BTF related tests. +Kernel bpf selftest `test_btf.c` provides extensive set of BTF-related tests. 
-- cgit v1.2.3-59-g8ed1b From 9ab5305dbe3ffcd146852e28aa76a917e45c7541 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 17:12:20 -0800 Subject: docs/btf: reflow text to fill up to 78 characters Reflow paragraphs to more fully and evenly fill 78 character lines. Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- Documentation/bpf/btf.rst | 300 ++++++++++++++++++++++------------------------ 1 file changed, 140 insertions(+), 160 deletions(-) diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index 1d761f1c5b2b..9a60a5d60e38 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -5,43 +5,35 @@ BPF Type Format (BTF) 1. Introduction *************** -BTF (BPF Type Format) is the metadata format which -encodes the debug info related to BPF program/map. -The name BTF was used initially to describe -data types. The BTF was later extended to include -function info for defined subroutines, and line info -for source/line information. - -The debug info is used for map pretty print, function -signature, etc. The function signature enables better -bpf program/function kernel symbol. -The line info helps generate -source annotated translated byte code, jited code -and verifier log. +BTF (BPF Type Format) is the metadata format which encodes the debug info +related to BPF program/map. The name BTF was used initially to describe data +types. The BTF was later extended to include function info for defined +subroutines, and line info for source/line information. + +The debug info is used for map pretty print, function signature, etc. The +function signature enables better bpf program/function kernel symbol. The line +info helps generate source annotated translated byte code, jited code and +verifier log. The BTF specification contains two parts, * BTF kernel API * BTF ELF file format -The kernel API is the contract between -user space and kernel. The kernel verifies -the BTF info before using it. -The ELF file format is a user space contract -between ELF file and libbpf loader. +The kernel API is the contract between user space and kernel. The kernel +verifies the BTF info before using it. The ELF file format is a user space +contract between ELF file and libbpf loader. -The type and string sections are part of the -BTF kernel API, describing the debug info -(mostly types related) referenced by the bpf program. -These two sections are discussed in -details in :ref:`BTF_Type_String`. +The type and string sections are part of the BTF kernel API, describing the +debug info (mostly types related) referenced by the bpf program. These two +sections are discussed in details in :ref:`BTF_Type_String`. .. _BTF_Type_String: 2. BTF Type and String Encoding ******************************* -The file ``include/uapi/linux/btf.h`` provides high-level -definition of how types/strings are encoded. +The file ``include/uapi/linux/btf.h`` provides high-level definition of how +types/strings are encoded. The beginning of data blob must be:: @@ -59,25 +51,23 @@ The beginning of data blob must be:: }; The magic is ``0xeB9F``, which has different encoding for big and little -endian systems, and can be used to test whether BTF is generated for -big- or little-endian target. -The ``btf_header`` is designed to be extensible with ``hdr_len`` equal to -``sizeof(struct btf_header)`` when a data blob is generated. +endian systems, and can be used to test whether BTF is generated for big- or +little-endian target. 
The ``btf_header`` is designed to be extensible with +``hdr_len`` equal to ``sizeof(struct btf_header)`` when a data blob is +generated. 2.1 String Encoding =================== -The first string in the string section must be a null string. -The rest of string table is a concatenation of other null-terminated -strings. +The first string in the string section must be a null string. The rest of +string table is a concatenation of other null-terminated strings. 2.2 Type Encoding ================= -The type id ``0`` is reserved for ``void`` type. -The type section is parsed sequentially and type id is assigned to -each recognized type starting from id ``1``. -Currently, the following types are supported:: +The type id ``0`` is reserved for ``void`` type. The type section is parsed +sequentially and type id is assigned to each recognized type starting from id +``1``. Currently, the following types are supported:: #define BTF_KIND_INT 1 /* Integer */ #define BTF_KIND_PTR 2 /* Pointer */ @@ -122,9 +112,9 @@ Each type contains the following common data:: }; }; -For certain kinds, the common data are followed by kind-specific data. -The ``name_off`` in ``struct btf_type`` specifies the offset in the string -table. The following sections detail encoding of each kind. +For certain kinds, the common data are followed by kind-specific data. The +``name_off`` in ``struct btf_type`` specifies the offset in the string table. +The following sections detail encoding of each kind. 2.2.1 BTF_KIND_INT ~~~~~~~~~~~~~~~~~~ @@ -148,38 +138,33 @@ The ``BTF_INT_ENCODING`` has the following attributes:: #define BTF_INT_CHAR (1 << 1) #define BTF_INT_BOOL (1 << 2) -The ``BTF_INT_ENCODING()`` provides extra information: signedness, -char, or bool, for the int type. The char and bool encoding -are mostly useful for pretty print. At most one encoding can -be specified for the int type. - -The ``BTF_INT_BITS()`` specifies the number of actual bits held by -this int type. For example, a 4-bit bitfield encodes -``BTF_INT_BITS()`` equals to 4. The ``btf_type.size * 8`` -must be equal to or greater than ``BTF_INT_BITS()`` for the type. -The maximum value of ``BTF_INT_BITS()`` is 128. - -The ``BTF_INT_OFFSET()`` specifies the starting bit offset to -calculate values for this int. For example, a bitfield struct -member has: - * btf member bit offset 100 from the start of the structure, - * btf member pointing to an int type, - * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` +The ``BTF_INT_ENCODING()`` provides extra information: signedness, char, or +bool, for the int type. The char and bool encoding are mostly useful for +pretty print. At most one encoding can be specified for the int type. + +The ``BTF_INT_BITS()`` specifies the number of actual bits held by this int +type. For example, a 4-bit bitfield encodes ``BTF_INT_BITS()`` equals to 4. +The ``btf_type.size * 8`` must be equal to or greater than ``BTF_INT_BITS()`` +for the type. The maximum value of ``BTF_INT_BITS()`` is 128. + +The ``BTF_INT_OFFSET()`` specifies the starting bit offset to calculate values +for this int. For example, a bitfield struct member has: * btf member bit +offset 100 from the start of the structure, * btf member pointing to an int +type, * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` -Then in the struct memory layout, this member will occupy -``4`` bits starting from bits ``100 + 2 = 102``. +Then in the struct memory layout, this member will occupy ``4`` bits starting +from bits ``100 + 2 = 102``. 
-Alternatively, the bitfield struct member can be the following to -access the same bits as the above: +Alternatively, the bitfield struct member can be the following to access the +same bits as the above: * btf member bit offset 102, * btf member pointing to an int type, * the int type has ``BTF_INT_OFFSET() = 0`` and ``BTF_INT_BITS() = 4`` -The original intention of ``BTF_INT_OFFSET()`` is to provide -flexibility of bitfield encoding. -Currently, both llvm and pahole generate ``BTF_INT_OFFSET() = 0`` -for all int types. +The original intention of ``BTF_INT_OFFSET()`` is to provide flexibility of +bitfield encoding. Currently, both llvm and pahole generate +``BTF_INT_OFFSET() = 0`` for all int types. 2.2.2 BTF_KIND_PTR ~~~~~~~~~~~~~~~~~~ @@ -216,26 +201,25 @@ The ``struct btf_array`` encoding: * ``index_type``: the index type * ``nelems``: the number of elements for this array (``0`` is also allowed). -The ``index_type`` can be any regular int type -(``u8``, ``u16``, ``u32``, ``u64``, ``unsigned __int128``). -The original design of including ``index_type`` follows DWARF, -which has an ``index_type`` for its array type. +The ``index_type`` can be any regular int type (``u8``, ``u16``, ``u32``, +``u64``, ``unsigned __int128``). The original design of including +``index_type`` follows DWARF, which has an ``index_type`` for its array type. Currently in BTF, beyond type verification, the ``index_type`` is not used. The ``struct btf_array`` allows chaining through element type to represent -multidimensional arrays. For example, for ``int a[5][6]``, the following -type information illustrates the chaining: +multidimensional arrays. For example, for ``int a[5][6]``, the following type +information illustrates the chaining: * [1]: int * [2]: array, ``btf_array.type = [1]``, ``btf_array.nelems = 6`` * [3]: array, ``btf_array.type = [2]``, ``btf_array.nelems = 5`` -Currently, both pahole and llvm collapse multidimensional array -into one-dimensional array, e.g., for ``a[5][6]``, the ``btf_array.nelems`` -is equal to ``30``. This is because the original use case is map pretty -print where the whole array is dumped out so one-dimensional array -is enough. As more BTF usage is explored, pahole and llvm can be -changed to generate proper chained representation for multidimensional arrays. +Currently, both pahole and llvm collapse multidimensional array into +one-dimensional array, e.g., for ``a[5][6]``, the ``btf_array.nelems`` is +equal to ``30``. This is because the original use case is map pretty print +where the whole array is dumped out so one-dimensional array is enough. As +more BTF usage is explored, pahole and llvm can be changed to generate proper +chained representation for multidimensional arrays. 2.2.4 BTF_KIND_STRUCT ~~~~~~~~~~~~~~~~~~~~~ @@ -262,28 +246,26 @@ changed to generate proper chained representation for multidimensional arrays. * ``type``: the member type * ``offset``: -If the type info ``kind_flag`` is not set, the offset contains -only bit offset of the member. Note that the base type of the -bitfield can only be int or enum type. If the bitfield size -is 32, the base type can be either int or enum type. -If the bitfield size is not 32, the base type must be int, -and int type ``BTF_INT_BITS()`` encodes the bitfield size. +If the type info ``kind_flag`` is not set, the offset contains only bit offset +of the member. Note that the base type of the bitfield can only be int or enum +type. If the bitfield size is 32, the base type can be either int or enum +type. 
If the bitfield size is not 32, the base type must be int, and int type +``BTF_INT_BITS()`` encodes the bitfield size. -If the ``kind_flag`` is set, the ``btf_member.offset`` -contains both member bitfield size and bit offset. The -bitfield size and bit offset are calculated as below.:: +If the ``kind_flag`` is set, the ``btf_member.offset`` contains both member +bitfield size and bit offset. The bitfield size and bit offset are calculated +as below.:: #define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) #define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) -In this case, if the base type is an int type, it must -be a regular int type: +In this case, if the base type is an int type, it must be a regular int type: * ``BTF_INT_OFFSET()`` must be 0. * ``BTF_INT_BITS()`` must be equal to ``{1,2,4,8,16} * 8``. -The following kernel patch introduced ``kind_flag`` and -explained why both modes exist: +The following kernel patch introduced ``kind_flag`` and explained why both +modes exist: https://github.com/torvalds/linux/commit/9d5f9f701b1891466fb3dbb1806ad97716f95cc3#diff-fa650a64fdd3968396883d2fe8215ff3 @@ -381,10 +363,10 @@ No additional type data follow ``btf_type``. No additional type data follow ``btf_type``. A BTF_KIND_FUNC defines not a type, but a subprogram (function) whose -signature is defined by ``type``. The subprogram is thus an instance of -that type. The BTF_KIND_FUNC may in turn be referenced by a func_info in -the :ref:`BTF_Ext_Section` (ELF) or in the arguments to -:ref:`BPF_Prog_Load` (ABI). +signature is defined by ``type``. The subprogram is thus an instance of that +type. The BTF_KIND_FUNC may in turn be referenced by a func_info in the +:ref:`BTF_Ext_Section` (ELF) or in the arguments to :ref:`BPF_Prog_Load` +(ABI). 2.2.13 BTF_KIND_FUNC_PROTO ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -403,13 +385,13 @@ the :ref:`BTF_Ext_Section` (ELF) or in the arguments to __u32 type; }; -If a BTF_KIND_FUNC_PROTO type is referred by a BTF_KIND_FUNC type, -then ``btf_param.name_off`` must point to a valid C identifier -except for the possible last argument representing the variable -argument. The btf_param.type refers to parameter type. +If a BTF_KIND_FUNC_PROTO type is referred by a BTF_KIND_FUNC type, then +``btf_param.name_off`` must point to a valid C identifier except for the +possible last argument representing the variable argument. The btf_param.type +refers to parameter type. -If the function has variable arguments, the last parameter -is encoded with ``name_off = 0`` and ``type = 0``. +If the function has variable arguments, the last parameter is encoded with +``name_off = 0`` and ``type = 0``. 3. BTF Kernel API ***************** @@ -457,10 +439,9 @@ The workflow typically looks like: 3.1 BPF_BTF_LOAD ================ -Load a blob of BTF data into kernel. A blob of data, -described in :ref:`BTF_Type_String`, -can be directly loaded into the kernel. -A ``btf_fd`` is returned to a userspace. +Load a blob of BTF data into kernel. A blob of data, described in +:ref:`BTF_Type_String`, can be directly loaded into the kernel. A ``btf_fd`` +is returned to a userspace. 3.2 BPF_MAP_CREATE ================== @@ -482,18 +463,18 @@ In libbpf, the map can be defined with extra annotation like below: }; BPF_ANNOTATE_KV_PAIR(btf_map, int, struct ipv_counts); -Here, the parameters for macro BPF_ANNOTATE_KV_PAIR are map name, -key and value types for the map. -During ELF parsing, libbpf is able to extract key/value type_id's -and assign them to BPF_MAP_CREATE attributes automatically. 
+Here, the parameters for macro BPF_ANNOTATE_KV_PAIR are map name, key and +value types for the map. During ELF parsing, libbpf is able to extract +key/value type_id's and assign them to BPF_MAP_CREATE attributes +automatically. .. _BPF_Prog_Load: 3.3 BPF_PROG_LOAD ================= -During prog_load, func_info and line_info can be passed to kernel with -proper values for the following attributes: +During prog_load, func_info and line_info can be passed to kernel with proper +values for the following attributes: :: __u32 insn_cnt; @@ -520,9 +501,9 @@ The func_info and line_info are an array of below, respectively.:: __u32 line_col; /* line number and column number */ }; -func_info_rec_size is the size of each func_info record, and line_info_rec_size -is the size of each line_info record. Passing the record size to kernel make -it possible to extend the record itself in the future. +func_info_rec_size is the size of each func_info record, and +line_info_rec_size is the size of each line_info record. Passing the record +size to kernel make it possible to extend the record itself in the future. Below are requirements for func_info: * func_info[0].insn_off must be 0. @@ -541,13 +522,12 @@ For line_info, the line number and column number are defined as below: 3.4 BPF_{PROG,MAP}_GET_NEXT_ID -In kernel, every loaded program, map or btf has a unique id. -The id won't change during the lifetime of a program, map, or btf. +In kernel, every loaded program, map or btf has a unique id. The id won't +change during the lifetime of a program, map, or btf. -The bpf syscall command BPF_{PROG,MAP}_GET_NEXT_ID -returns all id's, one for each command, to user space, for bpf -program or maps, respectively, -so an inspection tool can inspect all programs and maps. +The bpf syscall command BPF_{PROG,MAP}_GET_NEXT_ID returns all id's, one for +each command, to user space, for bpf program or maps, respectively, so an +inspection tool can inspect all programs and maps. 3.5 BPF_{PROG,MAP}_GET_FD_BY_ID @@ -557,24 +537,23 @@ A file descriptor needs to be obtained first for reference-counting purpose. 3.6 BPF_OBJ_GET_INFO_BY_FD ========================== -Once a program/map fd is acquired, an introspection tool can -get the detailed information from kernel about this fd, -some of which are BTF-related. For example, -``bpf_map_info`` returns ``btf_id`` and key/value type ids. -``bpf_prog_info`` returns ``btf_id``, func_info, and line info -for translated bpf byte codes, and jited_line_info. +Once a program/map fd is acquired, an introspection tool can get the detailed +information from kernel about this fd, some of which are BTF-related. For +example, ``bpf_map_info`` returns ``btf_id`` and key/value type ids. +``bpf_prog_info`` returns ``btf_id``, func_info, and line info for translated +bpf byte codes, and jited_line_info. 3.7 BPF_BTF_GET_FD_BY_ID ======================== -With ``btf_id`` obtained in ``bpf_map_info`` and ``bpf_prog_info``, -bpf syscall command BPF_BTF_GET_FD_BY_ID can retrieve a btf fd. -Then, with command BPF_OBJ_GET_INFO_BY_FD, the btf blob, originally -loaded into the kernel with BPF_BTF_LOAD, can be retrieved. +With ``btf_id`` obtained in ``bpf_map_info`` and ``bpf_prog_info``, bpf +syscall command BPF_BTF_GET_FD_BY_ID can retrieve a btf fd. Then, with +command BPF_OBJ_GET_INFO_BY_FD, the btf blob, originally loaded into the +kernel with BPF_BTF_LOAD, can be retrieved. 
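Tying these commands together, a minimal introspection loop built on the
libbpf syscall wrappers could look roughly like the sketch below. Error
handling is mostly omitted and the wrapper names are the ones from
tools/lib/bpf; treat it as an outline of the workflow rather than a finished
tool::

    #include <bpf/bpf.h>      /* libbpf syscall wrappers (tools/lib/bpf/bpf.h in-tree) */
    #include <linux/bpf.h>    /* struct bpf_map_info */
    #include <stdio.h>
    #include <unistd.h>

    static void dump_map_btf_ids(void)
    {
            __u32 id = 0;

            /* BPF_MAP_GET_NEXT_ID: walk all map ids in the kernel */
            while (!bpf_map_get_next_id(id, &id)) {
                    struct bpf_map_info info = {};
                    __u32 len = sizeof(info);
                    int map_fd, btf_fd;

                    /* BPF_MAP_GET_FD_BY_ID: take a reference on the map */
                    map_fd = bpf_map_get_fd_by_id(id);
                    if (map_fd < 0)
                            continue;

                    /* BPF_OBJ_GET_INFO_BY_FD: btf_id and key/value type ids */
                    if (!bpf_obj_get_info_by_fd(map_fd, &info, &len) && info.btf_id) {
                            printf("map %u: btf_id %u key type %u value type %u\n",
                                   id, info.btf_id, info.btf_key_type_id,
                                   info.btf_value_type_id);

                            /* BPF_BTF_GET_FD_BY_ID: with this fd the blob can be
                             * read back via another BPF_OBJ_GET_INFO_BY_FD call.
                             */
                            btf_fd = bpf_btf_get_fd_by_id(info.btf_id);
                            if (btf_fd >= 0)
                                    close(btf_fd);
                    }
                    close(map_fd);
            }
    }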
With the btf blob, ``bpf_map_info``, and ``bpf_prog_info``, an introspection -tool has full btf knowledge and is able to pretty print map key/values, -dump func signatures and line info, along with byte/jit codes. +tool has full btf knowledge and is able to pretty print map key/values, dump +func signatures and line info, along with byte/jit codes. 4. ELF File Format Interface **************************** @@ -582,19 +561,19 @@ dump func signatures and line info, along with byte/jit codes. 4.1 .BTF section ================ -The .BTF section contains type and string data. The format of this section -is same as the one describe in :ref:`BTF_Type_String`. +The .BTF section contains type and string data. The format of this section is +same as the one describe in :ref:`BTF_Type_String`. .. _BTF_Ext_Section: 4.2 .BTF.ext section ==================== -The .BTF.ext section encodes func_info and line_info which -needs loader manipulation before loading into the kernel. +The .BTF.ext section encodes func_info and line_info which needs loader +manipulation before loading into the kernel. -The specification for .BTF.ext section is defined at -``tools/lib/bpf/btf.h`` and ``tools/lib/bpf/btf.c``. +The specification for .BTF.ext section is defined at ``tools/lib/bpf/btf.h`` +and ``tools/lib/bpf/btf.c``. The current header of .BTF.ext section:: @@ -611,9 +590,9 @@ The current header of .BTF.ext section:: __u32 line_info_len; }; -It is very similar to .BTF section. Instead of type/string section, -it contains func_info and line_info section. See :ref:`BPF_Prog_Load` -for details about func_info and line_info record format. +It is very similar to .BTF section. Instead of type/string section, it +contains func_info and line_info section. See :ref:`BPF_Prog_Load` for details +about func_info and line_info record format. The func_info is organized as below.:: @@ -622,9 +601,9 @@ The func_info is organized as below.:: btf_ext_info_sec for section #2 /* func_info for section #2 */ ... -``func_info_rec_size`` specifies the size of ``bpf_func_info`` structure -when .BTF.ext is generated. ``btf_ext_info_sec``, defined below, is -a collection of func_info for each specific ELF section.:: +``func_info_rec_size`` specifies the size of ``bpf_func_info`` structure when +.BTF.ext is generated. ``btf_ext_info_sec``, defined below, is a collection of +func_info for each specific ELF section.:: struct btf_ext_info_sec { __u32 sec_name_off; /* offset to section name */ @@ -642,14 +621,14 @@ The line_info is organized as below.:: btf_ext_info_sec for section #2 /* line_info for section #2 */ ... -``line_info_rec_size`` specifies the size of ``bpf_line_info`` structure -when .BTF.ext is generated. +``line_info_rec_size`` specifies the size of ``bpf_line_info`` structure when +.BTF.ext is generated. The interpretation of ``bpf_func_info->insn_off`` and -``bpf_line_info->insn_off`` is different between kernel API and ELF API. -For kernel API, the ``insn_off`` is the instruction offset in the unit -of ``struct bpf_insn``. For ELF API, the ``insn_off`` is the byte offset -from the beginning of section (``btf_ext_info_sec->sec_name_off``). +``bpf_line_info->insn_off`` is different between kernel API and ELF API. For +kernel API, the ``insn_off`` is the instruction offset in the unit of ``struct +bpf_insn``. For ELF API, the ``insn_off`` is the byte offset from the +beginning of section (``btf_ext_info_sec->sec_name_off``). 5. 
Using BTF ************ @@ -657,10 +636,9 @@ from the beginning of section (``btf_ext_info_sec->sec_name_off``). 5.1 bpftool map pretty print ============================ -With BTF, the map key/value can be printed based on fields rather than -simply raw bytes. This is especially -valuable for large structure or if your data structure -has bitfields. For example, for the following map,:: +With BTF, the map key/value can be printed based on fields rather than simply +raw bytes. This is especially valuable for large structure or if your data +structure has bitfields. For example, for the following map,:: enum A { A1, A2, A3, A4, A5 }; typedef enum A ___A; @@ -700,9 +678,9 @@ bpftool is able to pretty print like below: 5.2 bpftool prog dump ===================== -The following is an example showing how func_info and line_info -can help prog dump with better kernel symbol names, function prototypes -and line information.:: +The following is an example showing how func_info and line_info can help prog +dump with better kernel symbol names, function prototypes and line +information.:: $ bpftool prog dump jited pinned /sys/fs/bpf/test_btf_haskv [...] @@ -734,7 +712,8 @@ and line information.:: 5.3 Verifier Log ================ -The following is an example of how line_info can help debugging verification failure.:: +The following is an example of how line_info can help debugging verification +failure.:: /* The code at tools/testing/selftests/bpf/test_xdp_noinline.c * is modified as below. @@ -763,8 +742,8 @@ You need latest pahole https://git.kernel.org/pub/scm/devel/pahole/pahole.git/ -or llvm (8.0 or later). The pahole acts as a dwarf2btf converter. It doesn't support .BTF.ext -and btf BTF_KIND_FUNC type yet. For example,:: +or llvm (8.0 or later). The pahole acts as a dwarf2btf converter. It doesn't +support .BTF.ext and btf BTF_KIND_FUNC type yet. For example,:: -bash-4.4$ cat t.c struct t { @@ -781,8 +760,9 @@ and btf BTF_KIND_FUNC type yet. For example,:: c type_id=2 bitfield_size=2 bits_offset=5 [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED -The llvm is able to generate .BTF and .BTF.ext directly with -g for bpf target only. -The assembly code (-S) is able to show the BTF encoding in assembly format.:: +The llvm is able to generate .BTF and .BTF.ext directly with -g for bpf target +only. The assembly code (-S) is able to show the BTF encoding in assembly +format.:: -bash-4.4$ cat t2.c typedef int __int32; -- cgit v1.2.3-59-g8ed1b From 46604676c8c6c4c07649767d32ae66f4429ccd6f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 28 Feb 2019 17:12:21 -0800 Subject: docs/bpf: minor casing/punctuation fixes Fix few casing and punctuation glitches. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- Documentation/bpf/bpf_design_QA.rst | 24 ++++++++++++------------ Documentation/networking/filter.txt | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst index 7cc9e368c1e9..10453c627135 100644 --- a/Documentation/bpf/bpf_design_QA.rst +++ b/Documentation/bpf/bpf_design_QA.rst @@ -36,27 +36,27 @@ consideration important quirks of other architectures) and defines calling convention that is compatible with C calling convention of the linux kernel on those architectures. -Q: can multiple return values be supported in the future? +Q: Can multiple return values be supported in the future? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A: NO. 
BPF allows only register R0 to be used as return value. -Q: can more than 5 function arguments be supported in the future? +Q: Can more than 5 function arguments be supported in the future? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A: NO. BPF calling convention only allows registers R1-R5 to be used as arguments. BPF is not a standalone instruction set. (unlike x64 ISA that allows msft, cdecl and other conventions) -Q: can BPF programs access instruction pointer or return address? +Q: Can BPF programs access instruction pointer or return address? ----------------------------------------------------------------- A: NO. -Q: can BPF programs access stack pointer ? +Q: Can BPF programs access stack pointer ? ------------------------------------------ A: NO. Only frame pointer (register R10) is accessible. From compiler point of view it's necessary to have stack pointer. -For example LLVM defines register R11 as stack pointer in its +For example, LLVM defines register R11 as stack pointer in its BPF backend, but it makes sure that generated code never uses it. Q: Does C-calling convention diminishes possible use cases? @@ -66,8 +66,8 @@ A: YES. BPF design forces addition of major functionality in the form of kernel helper functions and kernel objects like BPF maps with seamless interoperability between them. It lets kernel call into -BPF programs and programs call kernel helpers with zero overhead. -As all of them were native C code. That is particularly the case +BPF programs and programs call kernel helpers with zero overhead, +as all of them were native C code. That is particularly the case for JITed BPF programs that are indistinguishable from native kernel C code. @@ -75,9 +75,9 @@ Q: Does it mean that 'innovative' extensions to BPF code are disallowed? ------------------------------------------------------------------------ A: Soft yes. -At least for now until BPF core has support for +At least for now, until BPF core has support for bpf-to-bpf calls, indirect calls, loops, global variables, -jump tables, read only sections and all other normal constructs +jump tables, read-only sections, and all other normal constructs that C code can produce. Q: Can loops be supported in a safe way? @@ -109,16 +109,16 @@ For example why BPF_JNE and other compare and jumps are not cpu-like? A: This was necessary to avoid introducing flags into ISA which are impossible to make generic and efficient across CPU architectures. -Q: why BPF_DIV instruction doesn't map to x64 div? +Q: Why BPF_DIV instruction doesn't map to x64 div? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A: Because if we picked one-to-one relationship to x64 it would have made it more complicated to support on arm64 and other archs. Also it needs div-by-zero runtime check. -Q: why there is no BPF_SDIV for signed divide operation? +Q: Why there is no BPF_SDIV for signed divide operation? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A: Because it would be rarely used. llvm errors in such case and -prints a suggestion to use unsigned divide instead +prints a suggestion to use unsigned divide instead. Q: Why BPF has implicit prologue and epilogue? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index b5e060edfc38..319e5e041f38 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -829,7 +829,7 @@ tracing filters may do to maintain counters of events, for example. 
Register R9 is not used by socket filters either, but more complex filters may be running out of registers and would have to resort to spill/fill to stack. -Internal BPF can used as generic assembler for last step performance +Internal BPF can be used as a generic assembler for last step performance optimizations, socket filters and seccomp are using it as assembler. Tracing filters may use it as assembler to generate code from kernel. In kernel usage may not be bounded by security considerations, since generated internal BPF code -- cgit v1.2.3-59-g8ed1b From a83de90658280611933d6b3e5044726f63a953f2 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 28 Feb 2019 22:18:16 -0800 Subject: selftests/bpf: set unlimited RLIMIT_MEMLOCK for test_sock_fields This is to avoid permission denied error. A lot of systems may have a much lower number, e.g., 64KB, for RLIMIT_MEMLOCK, which may not be sufficient for the test to run successfully. Fixes: e0b27b3f97b8 ("bpf: Add test_sock_fields for skb->sk and bpf_tcp_sock") Signed-off-by: Yonghong Song Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_sock_fields.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/test_sock_fields.c b/tools/testing/selftests/bpf/test_sock_fields.c index 9bb58369b481..bc8943938bf5 100644 --- a/tools/testing/selftests/bpf/test_sock_fields.c +++ b/tools/testing/selftests/bpf/test_sock_fields.c @@ -14,6 +14,7 @@ #include #include "cgroup_helpers.h" +#include "bpf_rlimit.h" enum bpf_array_idx { SRV_IDX, -- cgit v1.2.3-59-g8ed1b From b74e21ab7d438117a10d2d331bdfc275fcab2970 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 28 Feb 2019 22:19:41 -0800 Subject: samples/bpf: silence compiler warning for xdpsock_user.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling xdpsock_user.c with 4.8.5, I hit the following compilation warning: HOSTCC samples/bpf/xdpsock_user.o /data/users/yhs/work/net-next/samples/bpf/xdpsock_user.c: In function ‘main’: /data/users/yhs/work/net-next/samples/bpf/xdpsock_user.c:449:6: warning: ‘idx_cq’ may be used unini tialized in this function [-Wmaybe-uninitialized] u32 idx_cq, idx_fq; ^ /data/users/yhs/work/net-next/samples/bpf/xdpsock_user.c:606:7: warning: ‘idx_rx’ may be used unini tialized in this function [-Wmaybe-uninitialized] u32 idx_rx, idx_tx = 0; ^ /data/users/yhs/work/net-next/samples/bpf/xdpsock_user.c:506:6: warning: ‘idx_rx’ may be used unini tialized in this function [-Wmaybe-uninitialized] u32 idx_rx, idx_fq = 0; As an example, the code pattern looks like: u32 idx_cq; ... ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); if (ret) { ... } ... idx_fq ... The compiler warns since it does not know whether &idx_fq is assigned or not inside the library function xsk_ring_prod__reserve(). Let us assign an initial value 0 to such auto variables to silence compiler warning. 
Fixes: 248c7f9c0e21 ("samples/bpf: convert xdpsock to use libbpf for AF_XDP access") Signed-off-by: Yonghong Song Acked-by: Jonathan Lemon Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- samples/bpf/xdpsock_user.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 9c76d6d43deb..d08ee1ab7bb4 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -446,7 +446,7 @@ static void kick_tx(struct xsk_socket_info *xsk) static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk) { - u32 idx_cq, idx_fq; + u32 idx_cq = 0, idx_fq = 0; unsigned int rcvd; size_t ndescs; @@ -503,7 +503,7 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk) static void rx_drop(struct xsk_socket_info *xsk) { unsigned int rcvd, i; - u32 idx_rx, idx_fq = 0; + u32 idx_rx = 0, idx_fq = 0; int ret; rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); @@ -603,7 +603,7 @@ static void l2fwd(struct xsk_socket_info *xsk) { for (;;) { unsigned int rcvd, i; - u32 idx_rx, idx_tx = 0; + u32 idx_rx = 0, idx_tx = 0; int ret; for (;;) { -- cgit v1.2.3-59-g8ed1b From f7c917ba11a67632a8452ea99fe132f626a7a2cc Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:46 -0800 Subject: bpf: add bpf helper bpf_skb_ecn_set_ce This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce "int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can be attached to the ingress and egress path. The helper is needed because his type of bpf_prog cannot modify the skb directly. This helper is used to set the ECN field of ECN capable IP packets to ce (congestion encountered) in the IPv6 or IPv4 header of the skb. It can be used by a bpf_prog to manage egress or ingress network bandwdith limit per cgroupv2 by inducing an ECN response in the TCP sender. This works best when using DCTCP. Signed-off-by: Lawrence Brakmo Signed-off-by: Martin KaFai Lau Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 10 +++++++++- net/core/filter.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e308e90ffea..3c38ac9a92a7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2359,6 +2359,13 @@ union bpf_attr { * Return * A **struct bpf_tcp_sock** pointer on success, or NULL in * case of failure. + * + * int bpf_skb_ecn_set_ce(struct sk_buf *skb) + * Description + * Sets ECN of IP header to ce (congestion encountered) if + * current value is ect (ECN capable). Works with IPv6 and IPv4. + * Return + * 1 if set, 0 if not set. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2457,7 +2464,8 @@ union bpf_attr { FN(spin_lock), \ FN(spin_unlock), \ FN(sk_fullsock), \ - FN(tcp_sock), + FN(tcp_sock), \ + FN(skb_ecn_set_ce), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 85749f6ec789..558ca72f2254 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5426,6 +5426,32 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = { .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) +{ + unsigned int iphdr_len; + + if (skb->protocol == cpu_to_be16(ETH_P_IP)) + iphdr_len = sizeof(struct iphdr); + else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) + iphdr_len = sizeof(struct ipv6hdr); + else + return 0; + + if (skb_headlen(skb) < iphdr_len) + return 0; + + if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) + return 0; + + return INET_ECN_set_ce(skb); +} + +static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { + .func = bpf_skb_ecn_set_ce, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5585,6 +5611,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; + case BPF_FUNC_skb_ecn_set_ce: + return &bpf_skb_ecn_set_ce_proto; #endif default: return sk_filter_func_proto(func_id, prog); -- cgit v1.2.3-59-g8ed1b From 5cce85c640ccc9d9aab8b05c77d7d076a44d4db2 Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:47 -0800 Subject: bpf: sync bpf.h to tools and update bpf_helpers.h This patch syncs the uapi bpf.h to tools/ and also updates bpf_herlpers.h in tools/ Signed-off-by: Lawrence Brakmo Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 10 +++++++++- tools/testing/selftests/bpf/bpf_helpers.h | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2e308e90ffea..3c38ac9a92a7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2359,6 +2359,13 @@ union bpf_attr { * Return * A **struct bpf_tcp_sock** pointer on success, or NULL in * case of failure. + * + * int bpf_skb_ecn_set_ce(struct sk_buf *skb) + * Description + * Sets ECN of IP header to ce (congestion encountered) if + * current value is ect (ECN capable). Works with IPv6 and IPv4. + * Return + * 1 if set, 0 if not set. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2457,7 +2464,8 @@ union bpf_attr { FN(spin_lock), \ FN(spin_unlock), \ FN(sk_fullsock), \ - FN(tcp_sock), + FN(tcp_sock), \ + FN(skb_ecn_set_ce), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 026bea831e03..c9433a496d54 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -180,6 +180,8 @@ static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) BPF_FUNC_sk_fullsock; static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) BPF_FUNC_tcp_sock; +static int (*bpf_skb_ecn_set_ce)(void *ctx) = + (void *) BPF_FUNC_skb_ecn_set_ce; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit v1.2.3-59-g8ed1b From 187d0738ff351f725a58be3d606d3a7fc8db8aed Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:48 -0800 Subject: bpf: Sample HBM BPF program to limit egress bw A cgroup skb BPF program to limit cgroup output bandwidth. It uses a modified virtual token bucket queue to limit average egress bandwidth. The implementation uses credits instead of tokens. Negative credits imply that queueing would have happened (this is a virtual queue, so no queueing is done by it. However, queueing may occur at the actual qdisc (which is not used for rate limiting). This implementation uses 3 thresholds, one to start marking packets and the other two to drop packets: CREDIT - <--------------------------|------------------------> + | | | 0 | Large pkt | | drop thresh | Small pkt drop Mark threshold thresh The effect of marking depends on the type of packet: a) If the packet is ECN enabled, then the packet is ECN ce marked. The current mark threshold is tuned for DCTCP. c) Else, it is dropped if it is a large packet. If the credit is below the drop threshold, the packet is dropped. Note that dropping a packet through the BPF program does not trigger CWR (Congestion Window Reduction) in TCP packets. A future patch will add support for triggering CWR. This BPF program actually uses 2 drop thresholds, one threshold for larger packets (>= 120 bytes) and another for smaller packets. This protects smaller packets such as SYNs, ACKs, etc. The default bandwidth limit is set at 1Gbps but this can be changed by a user program through a shared BPF map. In addition, by default this BPF program does not limit connections using loopback. This behavior can be overwritten by the user program. There is also an option to calculate some statistics, such as percent of packets marked or dropped, which the user program can access. 
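To put rough numbers on the scheme (the figures below are only illustrative
and are not taken from the program itself): at the default 1 Gbps limit the
virtual queue earns credit at about 125 bytes per microsecond, so after 100
microseconds of idle time it has accumulated 12500 bytes of credit (the
accumulated credit is capped). A burst of ten 1500-byte packets then consumes
15000 bytes and leaves the credit roughly 2500 bytes in deficit; as the
deficit grows past the marking threshold ECN-capable packets start being
marked, and past the drop thresholds first large packets and eventually all
packets are dropped, which is what feeds congestion back to the sender.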
A latter patch provides such a program (hbm.c) Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- samples/bpf/Makefile | 2 + samples/bpf/hbm.h | 31 +++++++++ samples/bpf/hbm_kern.h | 137 +++++++++++++++++++++++++++++++++++++++ samples/bpf/hbm_out_kern.c | 157 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 327 insertions(+) create mode 100644 samples/bpf/hbm.h create mode 100644 samples/bpf/hbm_kern.h create mode 100644 samples/bpf/hbm_out_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 0c62ac39c697..e1bdc96486f6 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -164,6 +164,7 @@ always += xdp_adjust_tail_kern.o always += xdp_fwd_kern.o always += task_fd_query_kern.o always += xdp_sample_pkts_kern.o +always += hbm_out_kern.o KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -263,6 +264,7 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) $(src)/*.c: verify_target_bpf $(LIBBPF) $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h +$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h new file mode 100644 index 000000000000..518e8147d084 --- /dev/null +++ b/samples/bpf/hbm.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Include file for Host Bandwidth Management (HBM) programs + */ +struct hbm_vqueue { + struct bpf_spin_lock lock; + /* 4 byte hole */ + unsigned long long lasttime; /* In ns */ + int credit; /* In bytes */ + unsigned int rate; /* In bytes per NS << 20 */ +}; + +struct hbm_queue_stats { + unsigned long rate; /* in Mbps*/ + unsigned long stats:1, /* get HBM stats (marked, dropped,..) */ + loopback:1; /* also limit flows using loopback */ + unsigned long long pkts_marked; + unsigned long long bytes_marked; + unsigned long long pkts_dropped; + unsigned long long bytes_dropped; + unsigned long long pkts_total; + unsigned long long bytes_total; + unsigned long long firstPacketTime; + unsigned long long lastPacketTime; +}; diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h new file mode 100644 index 000000000000..c5635d924193 --- /dev/null +++ b/samples/bpf/hbm_kern.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Include file for sample Host Bandwidth Manager (HBM) BPF programs + */ +#define KBUILD_MODNAME "foo" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_endian.h" +#include "bpf_helpers.h" +#include "hbm.h" + +#define DROP_PKT 0 +#define ALLOW_PKT 1 +#define TCP_ECN_OK 1 + +#define HBM_DEBUG 0 // Set to 1 to enable debugging +#if HBM_DEBUG +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) +#else +#define bpf_printk(fmt, ...) 
+#endif + +#define INITIAL_CREDIT_PACKETS 100 +#define MAX_BYTES_PER_PACKET 1500 +#define MARK_THRESH (40 * MAX_BYTES_PER_PACKET) +#define DROP_THRESH (80 * 5 * MAX_BYTES_PER_PACKET) +#define LARGE_PKT_DROP_THRESH (DROP_THRESH - (15 * MAX_BYTES_PER_PACKET)) +#define MARK_REGION_SIZE (LARGE_PKT_DROP_THRESH - MARK_THRESH) +#define LARGE_PKT_THRESH 120 +#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET) +#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET) + +// rate in bytes per ns << 20 +#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) + +struct bpf_map_def SEC("maps") queue_state = { + .type = BPF_MAP_TYPE_CGROUP_STORAGE, + .key_size = sizeof(struct bpf_cgroup_storage_key), + .value_size = sizeof(struct hbm_vqueue), +}; +BPF_ANNOTATE_KV_PAIR(queue_state, struct bpf_cgroup_storage_key, + struct hbm_vqueue); + +struct bpf_map_def SEC("maps") queue_stats = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct hbm_queue_stats), + .max_entries = 1, +}; +BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats); + +struct hbm_pkt_info { + bool is_ip; + bool is_tcp; + short ecn; +}; + +static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, + struct hbm_pkt_info *pkti) +{ + struct iphdr iph; + struct ipv6hdr *ip6h; + + bpf_skb_load_bytes(skb, 0, &iph, 12); + if (iph.version == 6) { + ip6h = (struct ipv6hdr *)&iph; + pkti->is_ip = true; + pkti->is_tcp = (ip6h->nexthdr == 6); + pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK; + } else if (iph.version == 4) { + pkti->is_ip = true; + pkti->is_tcp = (iph.protocol == 6); + pkti->ecn = iph.tos & INET_ECN_MASK; + } else { + pkti->is_ip = false; + pkti->is_tcp = false; + pkti->ecn = 0; + } +} + +static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) +{ + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = bpf_ktime_get_ns(); + qdp->credit = INIT_CREDIT; + qdp->rate = rate * 128; +} + +static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, + int len, + unsigned long long curtime, + bool congestion_flag, + bool drop_flag) +{ + if (qsp != NULL) { + // Following is needed for work conserving + __sync_add_and_fetch(&(qsp->bytes_total), len); + if (qsp->stats) { + // Optionally update statistics + if (qsp->firstPacketTime == 0) + qsp->firstPacketTime = curtime; + qsp->lastPacketTime = curtime; + __sync_add_and_fetch(&(qsp->pkts_total), 1); + if (congestion_flag || drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_marked), 1); + __sync_add_and_fetch(&(qsp->bytes_marked), len); + } + if (drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_dropped), 1); + __sync_add_and_fetch(&(qsp->bytes_dropped), + len); + } + } + } +} diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c new file mode 100644 index 000000000000..f806863d0b79 --- /dev/null +++ b/samples/bpf/hbm_out_kern.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Sample Host Bandwidth Manager (HBM) BPF program. + * + * A cgroup skb BPF egress program to limit cgroup output bandwidth. + * It uses a modified virtual token bucket queue to limit average + * egress bandwidth. The implementation uses credits instead of tokens. 
+ * Negative credits imply that queueing would have happened (this is + * a virtual queue, so no queueing is done by it. However, queueing may + * occur at the actual qdisc (which is not used for rate limiting). + * + * This implementation uses 3 thresholds, one to start marking packets and + * the other two to drop packets: + * CREDIT + * - <--------------------------|------------------------> + + * | | | 0 + * | Large pkt | + * | drop thresh | + * Small pkt drop Mark threshold + * thresh + * + * The effect of marking depends on the type of packet: + * a) If the packet is ECN enabled and it is a TCP packet, then the packet + * is ECN marked. + * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr + * to reduce the congestion window. The current implementation uses a linear + * distribution (0% probability at marking threshold, 100% probability + * at drop threshold). + * c) If the packet is not a TCP packet, then it is dropped. + * + * If the credit is below the drop threshold, the packet is dropped. If it + * is a TCP packet, then it also calls tcp_cwr since packets dropped by + * by a cgroup skb BPF program do not automatically trigger a call to + * tcp_cwr in the current kernel code. + * + * This BPF program actually uses 2 drop thresholds, one threshold + * for larger packets (>= 120 bytes) and another for smaller packets. This + * protects smaller packets such as SYNs, ACKs, etc. + * + * The default bandwidth limit is set at 1Gbps but this can be changed by + * a user program through a shared BPF map. In addition, by default this BPF + * program does not limit connections using loopback. This behavior can be + * overwritten by the user program. There is also an option to calculate + * some statistics, such as percent of packets marked or dropped, which + * the user program can access. + * + * A latter patch provides such a program (hbm.c) + */ + +#include "hbm_kern.h" + +SEC("cgroup_skb/egress") +int _hbm_out_cg(struct __sk_buff *skb) +{ + struct hbm_pkt_info pkti; + int len = skb->len; + unsigned int queue_index = 0; + unsigned long long curtime; + int credit; + signed long long delta = 0, zero = 0; + int max_credit = MAX_CREDIT; + bool congestion_flag = false; + bool drop_flag = false; + bool cwr_flag = false; + struct hbm_vqueue *qdp; + struct hbm_queue_stats *qsp = NULL; + int rv = ALLOW_PKT; + + qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); + if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) + return ALLOW_PKT; + + hbm_get_pkt_info(skb, &pkti); + + // We may want to account for the length of headers in len + // calculation, like ETH header + overhead, specially if it + // is a gso packet. But I am not doing it right now. 
+ + qdp = bpf_get_local_storage(&queue_state, 0); + if (!qdp) + return ALLOW_PKT; + else if (qdp->lasttime == 0) + hbm_init_vqueue(qdp, 1024); + + curtime = bpf_ktime_get_ns(); + + // Begin critical section + bpf_spin_lock(&qdp->lock); + credit = qdp->credit; + delta = curtime - qdp->lasttime; + /* delta < 0 implies that another process with a curtime greater + * than ours beat us to the critical section and already added + * the new credit, so we should not add it ourselves + */ + if (delta > 0) { + qdp->lasttime = curtime; + credit += CREDIT_PER_NS(delta, qdp->rate); + if (credit > MAX_CREDIT) + credit = MAX_CREDIT; + } + credit -= len; + qdp->credit = credit; + bpf_spin_unlock(&qdp->lock); + // End critical section + + // Check if we should update rate + if (qsp != NULL && (qsp->rate * 128) != qdp->rate) { + qdp->rate = qsp->rate * 128; + bpf_printk("Updating rate: %d (1sec:%llu bits)\n", + (int)qdp->rate, + CREDIT_PER_NS(1000000000, qdp->rate) * 8); + } + + // Set flags (drop, congestion, cwr) + // Dropping => we are congested, so ignore congestion flag + if (credit < -DROP_THRESH || + (len > LARGE_PKT_THRESH && + credit < -LARGE_PKT_DROP_THRESH)) { + // Very congested, set drop flag + drop_flag = true; + } else if (credit < 0) { + // Congested, set congestion flag + if (pkti.ecn) { + if (credit < -MARK_THRESH) + congestion_flag = true; + else + congestion_flag = false; + } else { + congestion_flag = true; + } + } + + if (congestion_flag) { + if (!bpf_skb_ecn_set_ce(skb)) { + if (len > LARGE_PKT_THRESH) { + // Problem if too many small packets? + drop_flag = true; + } + } + } + + if (drop_flag) + rv = DROP_PKT; + + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag); + + if (rv == DROP_PKT) + __sync_add_and_fetch(&(qdp->credit), len); + + return rv; +} +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3-59-g8ed1b From a1270fe95b74eb3195b107c494ed1f11b932a278 Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:49 -0800 Subject: bpf: User program for testing HBM The program nrm creates a cgroup and attaches a BPF program to the cgroup for testing HBM (Host Bandwidth Manager) for egress traffic. One still needs to create network traffic. This can be done through netesto, netperf or iperf3. A follow-up patch contains a script to create traffic. USAGE: hbm [-d] [-l] [-n ] [-r ] [-s] [-t ] [-w] [-h] [prog] Where: -d Print BPF trace debug buffer -l Also limit flows doing loopback -n <#> To create cgroup "/hbm#" and attach prog. Default is /nrm1 This is convenient when testing HBM in more than 1 cgroup -r Rate limit in Mbps -s Get HBM stats (marked, dropped, etc.) -t