aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/af_xdp.rst10
-rw-r--r--arch/arm64/net/bpf_jit.h3
-rw-r--r--arch/arm64/net/bpf_jit_comp.c6
-rw-r--r--arch/s390/net/bpf_jit_comp.c67
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_main.c5
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_xsk.c52
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_xsk.h2
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_main.c5
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h2
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c49
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/params.c23
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/params.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c8
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c5
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h14
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c15
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h14
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_rx.c7
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c27
-rw-r--r--drivers/net/ethernet/netronome/nfp/bpf/cmsg.c187
-rw-r--r--drivers/net/ethernet/netronome/nfp/bpf/fw.h1
-rw-r--r--drivers/net/ethernet/netronome/nfp/bpf/main.c33
-rw-r--r--drivers/net/ethernet/netronome/nfp/bpf/main.h24
-rw-r--r--drivers/net/ethernet/netronome/nfp/bpf/offload.c3
-rw-r--r--drivers/net/ethernet/netronome/nfp/nfp_net.h2
-rw-r--r--drivers/net/ethernet/netronome/nfp/nfp_net_common.c9
-rw-r--r--include/linux/bpf.h5
-rw-r--r--include/linux/bpf_verifier.h1
-rw-r--r--include/linux/netdevice.h14
-rw-r--r--include/linux/tnum.h6
-rw-r--r--include/net/bpf_sk_storage.h10
-rw-r--r--include/net/xdp_sock.h122
-rw-r--r--include/uapi/linux/bpf.h15
-rw-r--r--include/uapi/linux/if_xdp.h22
-rw-r--r--kernel/bpf/btf.c16
-rw-r--r--kernel/bpf/syscall.c21
-rw-r--r--kernel/bpf/sysfs_btf.c9
-rw-r--r--kernel/bpf/verifier.c5
-rw-r--r--kernel/bpf/xskmap.c133
-rw-r--r--kernel/trace/Kconfig3
-rw-r--r--lib/test_bpf.c2
-rw-r--r--net/core/bpf_sk_storage.c104
-rw-r--r--net/core/dev.c15
-rw-r--r--net/core/filter.c2
-rw-r--r--net/core/sock.c9
-rw-r--r--net/xdp/xdp_umem.c67
-rw-r--r--net/xdp/xsk.c349
-rw-r--r--net/xdp/xsk.h13
-rw-r--r--net/xdp/xsk_diag.c5
-rw-r--r--net/xdp/xsk_queue.h71
-rw-r--r--samples/bpf/syscall_nrs.c6
-rw-r--r--samples/bpf/tracex5_kern.c13
-rw-r--r--samples/bpf/xdpsock_user.c243
-rwxr-xr-xscripts/link-vmlinux.sh6
-rw-r--r--tools/bpf/.gitignore1
-rw-r--r--tools/bpf/Makefile5
-rw-r--r--tools/bpf/bpftool/.gitignore2
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-btf.rst7
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-map.rst9
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-net.rst57
-rw-r--r--tools/bpf/bpftool/Makefile31
-rw-r--r--tools/bpf/bpftool/bash-completion/bpftool89
-rw-r--r--tools/bpf/bpftool/btf.c344
-rw-r--r--tools/bpf/bpftool/btf_dumper.c8
-rw-r--r--tools/bpf/bpftool/cgroup.c2
-rw-r--r--tools/bpf/bpftool/common.c4
-rw-r--r--tools/bpf/bpftool/json_writer.c6
-rw-r--r--tools/bpf/bpftool/json_writer.h6
-rw-r--r--tools/bpf/bpftool/main.c2
-rw-r--r--tools/bpf/bpftool/main.h4
-rw-r--r--tools/bpf/bpftool/map.c64
-rw-r--r--tools/bpf/bpftool/map_perf_ring.c4
-rw-r--r--tools/bpf/bpftool/net.c178
-rw-r--r--tools/bpf/bpftool/perf.c4
-rw-r--r--tools/include/linux/compiler-gcc.h2
-rw-r--r--tools/include/uapi/linux/bpf.h15
-rw-r--r--tools/include/uapi/linux/if_xdp.h22
-rw-r--r--tools/lib/bpf/Makefile26
-rw-r--r--tools/lib/bpf/bpf.c24
-rw-r--r--tools/lib/bpf/bpf.h1
-rw-r--r--tools/lib/bpf/libbpf.map6
-rw-r--r--tools/lib/bpf/xsk.c86
-rw-r--r--tools/lib/bpf/xsk.h33
-rw-r--r--tools/testing/selftests/bpf/.gitignore1
-rw-r--r--tools/testing/selftests/bpf/Makefile6
-rw-r--r--tools/testing/selftests/bpf/bpf_endian.h16
-rw-r--r--tools/testing/selftests/bpf/bpf_helpers.h2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c20
-rw-r--r--tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c9
-rw-r--r--tools/testing/selftests/bpf/prog_tests/flow_dissector.c5
-rw-r--r--tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c3
-rw-r--r--tools/testing/selftests/bpf/prog_tests/global_data.c20
-rw-r--r--tools/testing/selftests/bpf/prog_tests/l4lb_all.c9
-rw-r--r--tools/testing/selftests/bpf/prog_tests/map_lock.c38
-rw-r--r--tools/testing/selftests/bpf/prog_tests/pkt_access.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/pkt_md_access.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/queue_stack_map.c8
-rw-r--r--tools/testing/selftests/bpf/prog_tests/reference_tracking.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/send_signal.c43
-rw-r--r--tools/testing/selftests/bpf/prog_tests/spinlock.c16
-rw-r--r--tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c7
-rw-r--r--tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c7
-rw-r--r--tools/testing/selftests/bpf/prog_tests/stacktrace_map.c17
-rw-r--r--tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c9
-rw-r--r--tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c3
-rw-r--r--tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c5
-rw-r--r--tools/testing/selftests/bpf/prog_tests/tcp_estats.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_noinline.c8
-rw-r--r--tools/testing/selftests/bpf/progs/sockopt_inherit.c97
-rw-r--r--tools/testing/selftests/bpf/progs/test_lwt_seg6local.c16
-rw-r--r--tools/testing/selftests/bpf/progs/test_seg6_loop.c8
-rwxr-xr-xtools/testing/selftests/bpf/test_bpftool_build.sh143
-rwxr-xr-xtools/testing/selftests/bpf/test_offload.py2
-rw-r--r--tools/testing/selftests/bpf/test_progs.c42
-rw-r--r--tools/testing/selftests/bpf/test_progs.h19
-rw-r--r--tools/testing/selftests/bpf/test_sockopt_inherit.c253
-rw-r--r--tools/testing/selftests/bpf/test_sysctl.c130
-rw-r--r--tools/testing/selftests/bpf/test_tcp_rtt.c31
-rw-r--r--tools/testing/selftests/bpf/test_verifier.c68
-rw-r--r--tools/testing/selftests/bpf/verifier/precise.c194
124 files changed, 3489 insertions, 698 deletions
diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index eeedc2e826aa..83f7ae5fc045 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -153,10 +153,12 @@ an example, if the UMEM is 64k and each chunk is 4k, then the UMEM has
Frames passed to the kernel are used for the ingress path (RX rings).
-The user application produces UMEM addrs to this ring. Note that the
-kernel will mask the incoming addr. E.g. for a chunk size of 2k, the
-log2(2048) LSB of the addr will be masked off, meaning that 2048, 2050
-and 3000 refers to the same chunk.
+The user application produces UMEM addrs to this ring. Note that, if
+running the application with aligned chunk mode, the kernel will mask
+the incoming addr. E.g. for a chunk size of 2k, the log2(2048) LSB of
+the addr will be masked off, meaning that 2048, 2050 and 3000 refers
+to the same chunk. If the user application is run in the unaligned
+chunks mode, then the incoming addr will be left untouched.
UMEM Completion Ring
diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h
index cb7ab50b7657..eb73f9f72c46 100644
--- a/arch/arm64/net/bpf_jit.h
+++ b/arch/arm64/net/bpf_jit.h
@@ -171,6 +171,9 @@
/* Rd = Ra + Rn * Rm */
#define A64_MADD(sf, Rd, Ra, Rn, Rm) aarch64_insn_gen_data3(Rd, Ra, Rn, Rm, \
A64_VARIANT(sf), AARCH64_INSN_DATA3_MADD)
+/* Rd = Ra - Rn * Rm */
+#define A64_MSUB(sf, Rd, Ra, Rn, Rm) aarch64_insn_gen_data3(Rd, Ra, Rn, Rm, \
+ A64_VARIANT(sf), AARCH64_INSN_DATA3_MSUB)
/* Rd = Rn * Rm */
#define A64_MUL(sf, Rd, Rn, Rm) A64_MADD(sf, Rd, A64_ZR, Rn, Rm)
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index f5b437f8a22b..cdc79de0c794 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -409,8 +409,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
break;
case BPF_MOD:
emit(A64_UDIV(is64, tmp, dst, src), ctx);
- emit(A64_MUL(is64, tmp, tmp, src), ctx);
- emit(A64_SUB(is64, dst, dst, tmp), ctx);
+ emit(A64_MSUB(is64, dst, dst, tmp, src), ctx);
break;
}
break;
@@ -516,8 +515,7 @@ emit_bswap_uxt:
case BPF_ALU64 | BPF_MOD | BPF_K:
emit_a64_mov_i(is64, tmp2, imm, ctx);
emit(A64_UDIV(is64, tmp, dst, tmp2), ctx);
- emit(A64_MUL(is64, tmp, tmp, tmp2), ctx);
- emit(A64_SUB(is64, dst, dst, tmp), ctx);
+ emit(A64_MSUB(is64, dst, dst, tmp, tmp2), ctx);
break;
case BPF_ALU | BPF_LSH | BPF_K:
case BPF_ALU64 | BPF_LSH | BPF_K:
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 955eb355c2fd..ce88211b9c6c 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -502,7 +502,8 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
* NOTE: Use noinline because for gcov (-fprofile-arcs) gcc allocates a lot of
* stack space for the large switch statement.
*/
-static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i)
+static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
+ int i, bool extra_pass)
{
struct bpf_insn *insn = &fp->insnsi[i];
int jmp_off, last, insn_count = 1;
@@ -1011,10 +1012,14 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
*/
case BPF_JMP | BPF_CALL:
{
- /*
- * b0 = (__bpf_call_base + imm)(b1, b2, b3, b4, b5)
- */
- const u64 func = (u64)__bpf_call_base + imm;
+ u64 func;
+ bool func_addr_fixed;
+ int ret;
+
+ ret = bpf_jit_get_func_addr(fp, insn, extra_pass,
+ &func, &func_addr_fixed);
+ if (ret < 0)
+ return -1;
REG_SET_SEEN(BPF_REG_5);
jit->seen |= SEEN_FUNC;
@@ -1283,7 +1288,8 @@ branch_oc:
/*
* Compile eBPF program into s390x code
*/
-static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp)
+static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
+ bool extra_pass)
{
int i, insn_count;
@@ -1292,7 +1298,7 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp)
bpf_jit_prologue(jit, fp->aux->stack_depth);
for (i = 0; i < fp->len; i += insn_count) {
- insn_count = bpf_jit_insn(jit, fp, i);
+ insn_count = bpf_jit_insn(jit, fp, i, extra_pass);
if (insn_count < 0)
return -1;
/* Next instruction address */
@@ -1311,6 +1317,12 @@ bool bpf_jit_needs_zext(void)
return true;
}
+struct s390_jit_data {
+ struct bpf_binary_header *header;
+ struct bpf_jit ctx;
+ int pass;
+};
+
/*
* Compile eBPF program "fp"
*/
@@ -1318,7 +1330,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
{
struct bpf_prog *tmp, *orig_fp = fp;
struct bpf_binary_header *header;
+ struct s390_jit_data *jit_data;
bool tmp_blinded = false;
+ bool extra_pass = false;
struct bpf_jit jit;
int pass;
@@ -1337,6 +1351,23 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
fp = tmp;
}
+ jit_data = fp->aux->jit_data;
+ if (!jit_data) {
+ jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+ if (!jit_data) {
+ fp = orig_fp;
+ goto out;
+ }
+ fp->aux->jit_data = jit_data;
+ }
+ if (jit_data->ctx.addrs) {
+ jit = jit_data->ctx;
+ header = jit_data->header;
+ extra_pass = true;
+ pass = jit_data->pass + 1;
+ goto skip_init_ctx;
+ }
+
memset(&jit, 0, sizeof(jit));
jit.addrs = kcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL);
if (jit.addrs == NULL) {
@@ -1349,7 +1380,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
* - 3: Calculate program size and addrs arrray
*/
for (pass = 1; pass <= 3; pass++) {
- if (bpf_jit_prog(&jit, fp)) {
+ if (bpf_jit_prog(&jit, fp, extra_pass)) {
fp = orig_fp;
goto free_addrs;
}
@@ -1361,12 +1392,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
fp = orig_fp;
goto free_addrs;
}
+
header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 2, jit_fill_hole);
if (!header) {
fp = orig_fp;
goto free_addrs;
}
- if (bpf_jit_prog(&jit, fp)) {
+skip_init_ctx:
+ if (bpf_jit_prog(&jit, fp, extra_pass)) {
bpf_jit_binary_free(header);
fp = orig_fp;
goto free_addrs;
@@ -1375,12 +1408,24 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
bpf_jit_dump(fp->len, jit.size, pass, jit.prg_buf);
print_fn_code(jit.prg_buf, jit.size_prg);
}
- bpf_jit_binary_lock_ro(header);
+ if (!fp->is_func || extra_pass) {
+ bpf_jit_binary_lock_ro(header);
+ } else {
+ jit_data->header = header;
+ jit_data->ctx = jit;
+ jit_data->pass = pass;
+ }
fp->bpf_func = (void *) jit.prg_buf;
fp->jited = 1;
fp->jited_len = jit.size;
+
+ if (!fp->is_func || extra_pass) {
+ bpf_prog_fill_jited_linfo(fp, jit.addrs + 1);
free_addrs:
- kfree(jit.addrs);
+ kfree(jit.addrs);
+ kfree(jit_data);
+ fp->aux->jit_data = NULL;
+ }
out:
if (tmp_blinded)
bpf_jit_prog_release_other(fp, fp == orig_fp ?
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index fdf43d87e983..3c8a2f55c43a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -12530,7 +12530,8 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi,
if (need_reset && prog)
for (i = 0; i < vsi->num_queue_pairs; i++)
if (vsi->xdp_rings[i]->xsk_umem)
- (void)i40e_xsk_async_xmit(vsi->netdev, i);
+ (void)i40e_xsk_wakeup(vsi->netdev, i,
+ XDP_WAKEUP_RX);
return 0;
}
@@ -12852,7 +12853,7 @@ static const struct net_device_ops i40e_netdev_ops = {
.ndo_bridge_setlink = i40e_ndo_bridge_setlink,
.ndo_bpf = i40e_xdp,
.ndo_xdp_xmit = i40e_xdp_xmit,
- .ndo_xsk_async_xmit = i40e_xsk_async_xmit,
+ .ndo_xsk_wakeup = i40e_xsk_wakeup,
.ndo_dfwd_add_station = i40e_fwd_add,
.ndo_dfwd_del_station = i40e_fwd_del,
};
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 32bad014d76c..0373bc6c7e61 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -116,7 +116,7 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
return err;
/* Kick start the NAPI context so that receiving will start */
- err = i40e_xsk_async_xmit(vsi->netdev, qid);
+ err = i40e_xsk_wakeup(vsi->netdev, qid, XDP_WAKEUP_RX);
if (err)
return err;
}
@@ -190,7 +190,9 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
**/
static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
{
+ struct xdp_umem *umem = rx_ring->xsk_umem;
int err, result = I40E_XDP_PASS;
+ u64 offset = umem->headroom;
struct i40e_ring *xdp_ring;
struct bpf_prog *xdp_prog;
u32 act;
@@ -201,7 +203,10 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
*/
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
act = bpf_prog_run_xdp(xdp_prog, xdp);
- xdp->handle += xdp->data - xdp->data_hard_start;
+ offset += xdp->data - xdp->data_hard_start;
+
+ xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
+
switch (act) {
case XDP_PASS:
break;
@@ -262,7 +267,7 @@ static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring,
bi->addr = xdp_umem_get_data(umem, handle);
bi->addr += hr;
- bi->handle = handle + umem->headroom;
+ bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
xsk_umem_discard_addr(umem);
return true;
@@ -299,7 +304,7 @@ static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring,
bi->addr = xdp_umem_get_data(umem, handle);
bi->addr += hr;
- bi->handle = handle + umem->headroom;
+ bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
xsk_umem_discard_addr_rq(umem);
return true;
@@ -420,8 +425,6 @@ static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
struct i40e_rx_buffer *old_bi)
{
struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc];
- unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask;
- u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
u16 nta = rx_ring->next_to_alloc;
/* update, and store next to alloc */
@@ -429,14 +432,9 @@ static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
/* transfer page from old buffer to new buffer */
- new_bi->dma = old_bi->dma & mask;
- new_bi->dma += hr;
-
- new_bi->addr = (void *)((unsigned long)old_bi->addr & mask);
- new_bi->addr += hr;
-
- new_bi->handle = old_bi->handle & mask;
- new_bi->handle += rx_ring->xsk_umem->headroom;
+ new_bi->dma = old_bi->dma;
+ new_bi->addr = old_bi->addr;
+ new_bi->handle = old_bi->handle;
old_bi->addr = NULL;
}
@@ -471,7 +469,8 @@ void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
bi->addr += hr;
- bi->handle = (u64)handle + rx_ring->xsk_umem->headroom;
+ bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
+ rx_ring->xsk_umem->headroom);
}
/**
@@ -626,6 +625,15 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
i40e_finalize_xdp_rx(rx_ring, xdp_xmit);
i40e_update_rx_stats(rx_ring, total_rx_bytes, total_rx_packets);
+
+ if (xsk_umem_uses_need_wakeup(rx_ring->xsk_umem)) {
+ if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
+ xsk_set_rx_need_wakeup(rx_ring->xsk_umem);
+ else
+ xsk_clear_rx_need_wakeup(rx_ring->xsk_umem);
+
+ return (int)total_rx_packets;
+ }
return failure ? budget : (int)total_rx_packets;
}
@@ -681,6 +689,8 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
i40e_xdp_ring_update_tail(xdp_ring);
xsk_umem_consume_tx_done(xdp_ring->xsk_umem);
+ if (xsk_umem_uses_need_wakeup(xdp_ring->xsk_umem))
+ xsk_clear_tx_need_wakeup(xdp_ring->xsk_umem);
}
return !!budget && work_done;
@@ -759,19 +769,27 @@ bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
i40e_update_tx_stats(tx_ring, completed_frames, total_bytes);
out_xmit:
+ if (xsk_umem_uses_need_wakeup(tx_ring->xsk_umem)) {
+ if (tx_ring->next_to_clean == tx_ring->next_to_use)
+ xsk_set_tx_need_wakeup(tx_ring->xsk_umem);
+ else
+ xsk_clear_tx_need_wakeup(tx_ring->xsk_umem);
+ }
+
xmit_done = i40e_xmit_zc(tx_ring, budget);
return work_done && xmit_done;
}
/**
- * i40e_xsk_async_xmit - Implements the ndo_xsk_async_xmit
+ * i40e_xsk_wakeup - Implements the ndo_xsk_wakeup
* @dev: the netdevice
* @queue_id: queue id to wake up
+ * @flags: ignored in our case since we have Rx and Tx in the same NAPI.
*
* Returns <0 for errors, 0 otherwise.
**/
-int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id)
+int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
{
struct i40e_netdev_priv *np = netdev_priv(dev);
struct i40e_vsi *vsi = np->vsi;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h
index 8cc0a2e7d9a2..9ed59c14eb55 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h
@@ -18,6 +18,6 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget);
bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
struct i40e_ring *tx_ring, int napi_budget);
-int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id);
+int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags);
#endif /* _I40E_XSK_H_ */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 17b7ae9f46ec..9bcae44e9883 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10260,7 +10260,8 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
if (need_reset && prog)
for (i = 0; i < adapter->num_rx_queues; i++)
if (adapter->xdp_ring[i]->xsk_umem)
- (void)ixgbe_xsk_async_xmit(adapter->netdev, i);
+ (void)ixgbe_xsk_wakeup(adapter->netdev, i,
+ XDP_WAKEUP_RX);
return 0;
}
@@ -10379,7 +10380,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
.ndo_features_check = ixgbe_features_check,
.ndo_bpf = ixgbe_xdp,
.ndo_xdp_xmit = ixgbe_xdp_xmit,
- .ndo_xsk_async_xmit = ixgbe_xsk_async_xmit,
+ .ndo_xsk_wakeup = ixgbe_xsk_wakeup,
};
static void ixgbe_disable_txr_hw(struct ixgbe_adapter *adapter,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h
index d93a690aff74..6d01700b46bc 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h
@@ -42,7 +42,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
void ixgbe_xsk_clean_rx_ring(struct ixgbe_ring *rx_ring);
bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector,
struct ixgbe_ring *tx_ring, int napi_budget);
-int ixgbe_xsk_async_xmit(struct net_device *dev, u32 queue_id);
+int ixgbe_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags);
void ixgbe_xsk_clean_tx_ring(struct ixgbe_ring *tx_ring);
#endif /* #define _IXGBE_TXRX_COMMON_H_ */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index 6b609553329f..ad802a8909e0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -100,7 +100,7 @@ static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter,
ixgbe_txrx_ring_enable(adapter, qid);
/* Kick start the NAPI context so that receiving will start */
- err = ixgbe_xsk_async_xmit(adapter->netdev, qid);
+ err = ixgbe_xsk_wakeup(adapter->netdev, qid, XDP_WAKEUP_RX);
if (err)
return err;
}
@@ -143,7 +143,9 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
struct ixgbe_ring *rx_ring,
struct xdp_buff *xdp)
{
+ struct xdp_umem *umem = rx_ring->xsk_umem;
int err, result = IXGBE_XDP_PASS;
+ u64 offset = umem->headroom;
struct bpf_prog *xdp_prog;
struct xdp_frame *xdpf;
u32 act;
@@ -151,7 +153,10 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
rcu_read_lock();
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
act = bpf_prog_run_xdp(xdp_prog, xdp);
- xdp->handle += xdp->data - xdp->data_hard_start;
+ offset += xdp->data - xdp->data_hard_start;
+
+ xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
+
switch (act) {
case XDP_PASS:
break;
@@ -201,8 +206,6 @@ ixgbe_rx_buffer *ixgbe_get_rx_buffer_zc(struct ixgbe_ring *rx_ring,
static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring,
struct ixgbe_rx_buffer *obi)
{
- unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask;
- u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
u16 nta = rx_ring->next_to_alloc;
struct ixgbe_rx_buffer *nbi;
@@ -212,14 +215,9 @@ static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring,
rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
/* transfer page from old buffer to new buffer */
- nbi->dma = obi->dma & mask;
- nbi->dma += hr;
-
- nbi->addr = (void *)((unsigned long)obi->addr & mask);
- nbi->addr += hr;
-
- nbi->handle = obi->handle & mask;
- nbi->handle += rx_ring->xsk_umem->headroom;
+ nbi->dma = obi->dma;
+ nbi->addr = obi->addr;
+ nbi->handle = obi->handle;
obi->addr = NULL;
obi->skb = NULL;
@@ -250,7 +248,8 @@ void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
bi->addr += hr;
- bi->handle = (u64)handle + rx_ring->xsk_umem->headroom;
+ bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
+ rx_ring->xsk_umem->headroom);
}
static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring,
@@ -276,7 +275,7 @@ static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring,
bi->addr = xdp_umem_get_data(umem, handle);
bi->addr += hr;
- bi->handle = handle + umem->headroom;
+ bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
xsk_umem_discard_addr(umem);
return true;
@@ -303,7 +302,7 @@ static bool ixgbe_alloc_buffer_slow_zc(struct ixgbe_ring *rx_ring,
bi->addr = xdp_umem_get_data(umem, handle);
bi->addr += hr;
- bi->handle = handle + umem->headroom;
+ bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
xsk_umem_discard_addr_rq(umem);
return true;
@@ -547,6 +546,14 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
q_vector->rx.total_packets += total_rx_packets;
q_vector->rx.total_bytes += total_rx_bytes;
+ if (xsk_umem_uses_need_wakeup(rx_ring->xsk_umem)) {
+ if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
+ xsk_set_rx_need_wakeup(rx_ring->xsk_umem);
+ else
+ xsk_clear_rx_need_wakeup(rx_ring->xsk_umem);
+
+ return (int)total_rx_packets;
+ }
return failure ? budget : (int)total_rx_packets;
}
@@ -615,6 +622,8 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
if (tx_desc) {
ixgbe_xdp_ring_update_tail(xdp_ring);
xsk_umem_consume_tx_done(xdp_ring->xsk_umem);
+ if (xsk_umem_uses_need_wakeup(xdp_ring->xsk_umem))
+ xsk_clear_tx_need_wakeup(xdp_ring->xsk_umem);
}
return !!budget && work_done;
@@ -688,11 +697,19 @@ bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector,
if (xsk_frames)
xsk_umem_complete_tx(umem, xsk_frames);
+ if (xsk_umem_uses_need_wakeup(tx_ring->xsk_umem)) {
+ if (tx_ring->next_to_clean == tx_ring->next_to_use)
+ xsk_set_tx_need_wakeup(tx_ring->xsk_umem);
+ else
+ xsk_clear_tx_need_wakeup(tx_ring->xsk_umem);
+ }
+
xmit_done = ixgbe_xmit_zc(tx_ring, q_vector->tx.work_limit);
+
return budget > 0 && xmit_done;
}
-int ixgbe_xsk_async_xmit(struct net_device *dev, u32 qid)
+int ixgbe_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
{
struct ixgbe_adapter *adapter = netdev_priv(dev);
struct ixgbe_ring *ring;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index 79301d116667..eb2e1f2138e4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -25,18 +25,33 @@ u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
return headroom;
}
-u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
- struct mlx5e_xsk_param *xsk)
+u32 mlx5e_rx_get_min_frag_sz(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk)
{
u32 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
u16 linear_rq_headroom = mlx5e_get_linear_rq_headroom(params, xsk);
- u32 frag_sz = linear_rq_headroom + hw_mtu;
+
+ return linear_rq_headroom + hw_mtu;
+}
+
+u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk)
+{
+ u32 frag_sz = mlx5e_rx_get_min_frag_sz(params, xsk);
/* AF_XDP doesn't build SKBs in place. */
if (!xsk)
frag_sz = MLX5_SKB_FRAG_SZ(frag_sz);
- /* XDP in mlx5e doesn't support multiple packets per page. */
+ /* XDP in mlx5e doesn't support multiple packets per page. AF_XDP is a
+ * special case. It can run with frames smaller than a page, as it
+ * doesn't allocate pages dynamically. However, here we pretend that
+ * fragments are page-sized: it allows to treat XSK frames like pages
+ * by redirecting alloc and free operations to XSK rings and by using
+ * the fact there are no multiple packets per "page" (which is a frame).
+ * The latter is important, because frames may come in a random order,
+ * and we will have trouble assemblying a real page of multiple frames.
+ */
if (mlx5e_rx_is_xdp(params, xsk))
frag_sz = max_t(u32, frag_sz, PAGE_SIZE);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
index 3a615d663d84..989d8f429438 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
@@ -76,6 +76,8 @@ static inline bool mlx5e_qid_validate(const struct mlx5e_profile *profile,
u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
+u32 mlx5e_rx_get_min_frag_sz(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk);
u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 1ed5c33e022f..f049e0ac308a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -122,6 +122,7 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
void *va, u16 *rx_headroom, u32 *len, bool xsk)
{
struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
+ struct xdp_umem *umem = rq->umem;
struct xdp_buff xdp;
u32 act;
int err;
@@ -138,8 +139,11 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
xdp.rxq = &rq->xdp_rxq;
act = bpf_prog_run_xdp(prog, &xdp);
- if (xsk)
- xdp.handle += xdp.data - xdp.data_hard_start;
+ if (xsk) {
+ u64 off = xdp.data - xdp.data_hard_start;
+
+ xdp.handle = xsk_umem_adjust_offset(umem, xdp.handle, off);
+ }
switch (act) {
case XDP_PASS:
*rx_headroom = xdp.data - xdp.data_hard_start;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 6a55573ec8f2..475b6bd5d29b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -24,7 +24,8 @@ int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
if (!xsk_umem_peek_addr_rq(umem, &handle))
return -ENOMEM;
- dma_info->xsk.handle = handle + rq->buff.umem_headroom;
+ dma_info->xsk.handle = xsk_umem_adjust_offset(umem, handle,
+ rq->buff.umem_headroom);
dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle);
/* No need to add headroom to the DMA address. In striding RQ case, we
@@ -104,7 +105,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
/* head_offset is not used in this function, because di->xsk.data and
* di->addr point directly to the necessary place. Furthermore, in the
- * current implementation, one page = one packet = one frame, so
+ * current implementation, UMR pages are mapped to XSK frames, so
* head_offset should always be 0.
*/
WARN_ON_ONCE(head_offset);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
index 307b923a1361..cab0e93497ae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -5,6 +5,7 @@
#define __MLX5_EN_XSK_RX_H__
#include "en.h"
+#include <net/xdp_sock.h>
/* RX data path */
@@ -24,4 +25,17 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *wi,
u32 cqe_bcnt);
+static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err)
+{
+ if (!xsk_umem_uses_need_wakeup(rq->umem))
+ return alloc_err;
+
+ if (unlikely(alloc_err))
+ xsk_set_rx_need_wakeup(rq->umem);
+ else
+ xsk_clear_rx_need_wakeup(rq->umem);
+
+ return false;
+}
+
#endif /* __MLX5_EN_XSK_RX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
index d360750b25b7..631af8dee517 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
@@ -4,18 +4,23 @@
#include "setup.h"
#include "en/params.h"
+/* It matches XDP_UMEM_MIN_CHUNK_SIZE, but as this constant is private and may
+ * change unexpectedly, and mlx5e has a minimum valid stride size for striding
+ * RQ, keep this check in the driver.
+ */
+#define MLX5E_MIN_XSK_CHUNK_SIZE 2048
+
bool mlx5e_validate_xsk_param(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk,
struct mlx5_core_dev *mdev)
{
- /* AF_XDP doesn't support frames larger than PAGE_SIZE, and the current
- * mlx5e XDP implementation doesn't support multiple packets per page.
- */
- if (xsk->chunk_size != PAGE_SIZE)
+ /* AF_XDP doesn't support frames larger than PAGE_SIZE. */
+ if (xsk->chunk_size > PAGE_SIZE ||
+ xsk->chunk_size < MLX5E_MIN_XSK_CHUNK_SIZE)
return false;
/* Current MTU and XSK headroom don't allow packets to fit the frames. */
- if (mlx5e_rx_get_linear_frag_sz(params, xsk) > xsk->chunk_size)
+ if (mlx5e_rx_get_min_frag_sz(params, xsk) > xsk->chunk_size)
return false;
/* frag_sz is different for regular and XSK RQs, so ensure that linear
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
index fd2c75b4b519..87827477d38c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
@@ -7,7 +7,7 @@
#include "en/params.h"
#include <net/xdp_sock.h>
-int mlx5e_xsk_async_xmit(struct net_device *dev, u32 qid)
+int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
{
struct mlx5e_priv *priv = netdev_priv(dev);
struct mlx5e_params *params = &priv->channels.params;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
index 7add18bf78d8..79b487d89757 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
@@ -5,11 +5,23 @@
#define __MLX5_EN_XSK_TX_H__
#include "en.h"
+#include <net/xdp_sock.h>
/* TX data path */
-int mlx5e_xsk_async_xmit(struct net_device *dev, u32 qid);
+int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget);
+static inline void mlx5e_xsk_update_tx_wakeup(struct mlx5e_xdpsq *sq)
+{
+ if (!xsk_umem_uses_need_wakeup(sq->umem))
+ return;
+
+ if (sq->pc != sq->cc)
+ xsk_clear_tx_need_wakeup(sq->umem);
+ else
+ xsk_set_tx_need_wakeup(sq->umem);
+}
+
#endif /* __MLX5_EN_XSK_TX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index dadadf221087..1cacda1bc1b4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4580,7 +4580,7 @@ const struct net_device_ops mlx5e_netdev_ops = {
.ndo_tx_timeout = mlx5e_tx_timeout,
.ndo_bpf = mlx5e_xdp,
.ndo_xdp_xmit = mlx5e_xdp_xmit,
- .ndo_xsk_async_xmit = mlx5e_xsk_async_xmit,
+ .ndo_xsk_wakeup = mlx5e_xsk_wakeup,
#ifdef CONFIG_MLX5_EN_ARFS
.ndo_rx_flow_steer = mlx5e_rx_flow_steer,
#endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 2fd2760d0bb7..d6a547238de0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -695,8 +695,11 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
rq->mpwqe.umr_in_progress += rq->mpwqe.umr_last_bulk;
rq->mpwqe.actual_wq_head = head;
- /* If XSK Fill Ring doesn't have enough frames, busy poll by
- * rescheduling the NAPI poll.
+ /* If XSK Fill Ring doesn't have enough frames, report the error, so
+ * that one of the actions can be performed:
+ * 1. If need_wakeup is used, signal that the application has to kick
+ * the driver when it refills the Fill Ring.
+ * 2. Otherwise, busy poll by rescheduling the NAPI poll.
*/
if (unlikely(alloc_err == -ENOMEM && rq->umem))
return true;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 49b06b256c92..257a7c9f7a14 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -33,6 +33,7 @@
#include <linux/irq.h>
#include "en.h"
#include "en/xdp.h"
+#include "en/xsk/rx.h"
#include "en/xsk/tx.h"
static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
@@ -81,6 +82,29 @@ void mlx5e_trigger_irq(struct mlx5e_icosq *sq)
mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &nopwqe->ctrl);
}
+static bool mlx5e_napi_xsk_post(struct mlx5e_xdpsq *xsksq, struct mlx5e_rq *xskrq)
+{
+ bool busy_xsk = false, xsk_rx_alloc_err;
+
+ /* Handle the race between the application querying need_wakeup and the
+ * driver setting it:
+ * 1. Update need_wakeup both before and after the TX. If it goes to
+ * "yes", it can only happen with the first update.
+ * 2. If the application queried need_wakeup before we set it, the
+ * packets will be transmitted anyway, even w/o a wakeup.
+ * 3. Give a chance to clear need_wakeup after new packets were queued
+ * for TX.
+ */
+ mlx5e_xsk_update_tx_wakeup(xsksq);
+ busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
+ mlx5e_xsk_update_tx_wakeup(xsksq);
+
+ xsk_rx_alloc_err = xskrq->post_wqes(xskrq);
+ busy_xsk |= mlx5e_xsk_update_rx_wakeup(xskrq, xsk_rx_alloc_err);
+
+ return busy_xsk;
+}
+
int mlx5e_napi_poll(struct napi_struct *napi, int budget)
{
struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel,
@@ -122,8 +146,7 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
if (xsk_open) {
mlx5e_poll_ico_cq(&c->xskicosq.cq);
busy |= mlx5e_poll_xdpsq_cq(&xsksq->cq);
- busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
- busy_xsk |= xskrq->post_wqes(xskrq);
+ busy_xsk |= mlx5e_napi_xsk_post(xsksq, xskrq);
}
busy |= busy_xsk;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
index bc9850e4ec5e..0e2db6ea79e9 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
@@ -6,6 +6,7 @@
#include <linux/bug.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
+#include <linux/timekeeping.h>
#include "../ccm.h"
#include "../nfp_app.h"
@@ -175,29 +176,151 @@ nfp_bpf_ctrl_reply_val(struct nfp_app_bpf *bpf, struct cmsg_reply_map_op *reply,
return &reply->data[bpf->cmsg_key_sz * (n + 1) + bpf->cmsg_val_sz * n];
}
+static bool nfp_bpf_ctrl_op_cache_invalidate(enum nfp_ccm_type op)
+{
+ return op == NFP_CCM_TYPE_BPF_MAP_UPDATE ||
+ op == NFP_CCM_TYPE_BPF_MAP_DELETE;
+}
+
+static bool nfp_bpf_ctrl_op_cache_capable(enum nfp_ccm_type op)
+{
+ return op == NFP_CCM_TYPE_BPF_MAP_LOOKUP ||
+ op == NFP_CCM_TYPE_BPF_MAP_GETNEXT;
+}
+
+static bool nfp_bpf_ctrl_op_cache_fill(enum nfp_ccm_type op)
+{
+ return op == NFP_CCM_TYPE_BPF_MAP_GETFIRST ||
+ op == NFP_CCM_TYPE_BPF_MAP_GETNEXT;
+}
+
+static unsigned int
+nfp_bpf_ctrl_op_cache_get(struct nfp_bpf_map *nfp_map, enum nfp_ccm_type op,
+ const u8 *key, u8 *out_key, u8 *out_value,
+ u32 *cache_gen)
+{
+ struct bpf_map *map = &nfp_map->offmap->map;
+ struct nfp_app_bpf *bpf = nfp_map->bpf;
+ unsigned int i, count, n_entries;
+ struct cmsg_reply_map_op *reply;
+
+ n_entries = nfp_bpf_ctrl_op_cache_fill(op) ? bpf->cmsg_cache_cnt : 1;
+
+ spin_lock(&nfp_map->cache_lock);
+ *cache_gen = nfp_map->cache_gen;
+ if (nfp_map->cache_blockers)
+ n_entries = 1;
+
+ if (nfp_bpf_ctrl_op_cache_invalidate(op))
+ goto exit_block;
+ if (!nfp_bpf_ctrl_op_cache_capable(op))
+ goto exit_unlock;
+
+ if (!nfp_map->cache)
+ goto exit_unlock;
+ if (nfp_map->cache_to < ktime_get_ns())
+ goto exit_invalidate;
+
+ reply = (void *)nfp_map->cache->data;
+ count = be32_to_cpu(reply->count);
+
+ for (i = 0; i < count; i++) {
+ void *cached_key;
+
+ cached_key = nfp_bpf_ctrl_reply_key(bpf, reply, i);
+ if (memcmp(cached_key, key, map->key_size))
+ continue;
+
+ if (op == NFP_CCM_TYPE_BPF_MAP_LOOKUP)
+ memcpy(out_value, nfp_bpf_ctrl_reply_val(bpf, reply, i),
+ map->value_size);
+ if (op == NFP_CCM_TYPE_BPF_MAP_GETNEXT) {
+ if (i + 1 == count)
+ break;
+
+ memcpy(out_key,
+ nfp_bpf_ctrl_reply_key(bpf, reply, i + 1),
+ map->key_size);
+ }
+
+ n_entries = 0;
+ goto exit_unlock;
+ }
+ goto exit_unlock;
+
+exit_block:
+ nfp_map->cache_blockers++;
+exit_invalidate:
+ dev_consume_skb_any(nfp_map->cache);
+ nfp_map->cache = NULL;
+exit_unlock:
+ spin_unlock(&nfp_map->cache_lock);
+ return n_entries;
+}
+
+static void
+nfp_bpf_ctrl_op_cache_put(struct nfp_bpf_map *nfp_map, enum nfp_ccm_type op,
+ struct sk_buff *skb, u32 cache_gen)
+{
+ bool blocker, filler;
+
+ blocker = nfp_bpf_ctrl_op_cache_invalidate(op);
+ filler = nfp_bpf_ctrl_op_cache_fill(op);
+ if (blocker || filler) {
+ u64 to = 0;
+
+ if (filler)
+ to = ktime_get_ns() + NFP_BPF_MAP_CACHE_TIME_NS;
+
+ spin_lock(&nfp_map->cache_lock);
+ if (blocker) {
+ nfp_map->cache_blockers--;
+ nfp_map->cache_gen++;
+ }
+ if (filler && !nfp_map->cache_blockers &&
+ nfp_map->cache_gen == cache_gen) {
+ nfp_map->cache_to = to;
+ swap(nfp_map->cache, skb);
+ }
+ spin_unlock(&nfp_map->cache_lock);
+ }
+
+ dev_consume_skb_any(skb);
+}
+
static int
nfp_bpf_ctrl_entry_op(struct bpf_offloaded_map *offmap, enum nfp_ccm_type op,
u8 *key, u8 *value, u64 flags, u8 *out_key, u8 *out_value)
{
struct nfp_bpf_map *nfp_map = offmap->dev_priv;
+ unsigned int n_entries, reply_entries, count;
struct nfp_app_bpf *bpf = nfp_map->bpf;
struct bpf_map *map = &offmap->map;
struct cmsg_reply_map_op *reply;
struct cmsg_req_map_op *req;
struct sk_buff *skb;
+ u32 cache_gen;
int err;
/* FW messages have no space for more than 32 bits of flags */
if (flags >> 32)
return -EOPNOTSUPP;
+ /* Handle op cache */
+ n_entries = nfp_bpf_ctrl_op_cache_get(nfp_map, op, key, out_key,
+ out_value, &cache_gen);
+ if (!n_entries)
+ return 0;
+
skb = nfp_bpf_cmsg_map_req_alloc(bpf, 1);
- if (!skb)
- return -ENOMEM;
+ if (!skb) {
+ err = -ENOMEM;
+ goto err_cache_put;
+ }
req = (void *)skb->data;
req->tid = cpu_to_be32(nfp_map->tid);
- req->count = cpu_to_be32(1);
+ req->count = cpu_to_be32(n_entries);
req->flags = cpu_to_be32(flags);
/* Copy inputs */
@@ -207,16 +330,38 @@ nfp_bpf_ctrl_entry_op(struct bpf_offloaded_map *offmap, enum nfp_ccm_type op,
memcpy(nfp_bpf_ctrl_req_val(bpf, req, 0), value,
map->value_size);
- skb = nfp_ccm_communicate(&bpf->ccm, skb, op,
- nfp_bpf_cmsg_map_reply_size(bpf, 1));
- if (IS_ERR(skb))
- return PTR_ERR(skb);
+ skb = nfp_ccm_communicate(&bpf->ccm, skb, op, 0);
+ if (IS_ERR(skb)) {
+ err = PTR_ERR(skb);
+ goto err_cache_put;
+ }
+
+ if (skb->len < sizeof(*reply)) {
+ cmsg_warn(bpf, "cmsg drop - type 0x%02x too short %d!\n",
+ op, skb->len);
+ err = -EIO;
+ goto err_free;
+ }
reply = (void *)skb->data;
+ count = be32_to_cpu(reply->count);
err = nfp_bpf_ctrl_rc_to_errno(bpf, &reply->reply_hdr);
+ /* FW responds with message sized to hold the good entries,
+ * plus one extra entry if there was an error.
+ */
+ reply_entries = count + !!err;
+ if (n_entries > 1 && count)
+ err = 0;
if (err)
goto err_free;
+ if (skb->len != nfp_bpf_cmsg_map_reply_size(bpf, reply_entries)) {
+ cmsg_warn(bpf, "cmsg drop - type 0x%02x too short %d for %d entries!\n",
+ op, skb->len, reply_entries);
+ err = -EIO;
+ goto err_free;
+ }
+
/* Copy outputs */
if (out_key)
memcpy(out_key, nfp_bpf_ctrl_reply_key(bpf, reply, 0),
@@ -225,11 +370,13 @@ nfp_bpf_ctrl_entry_op(struct bpf_offloaded_map *offmap, enum nfp_ccm_type op,
memcpy(out_value, nfp_bpf_ctrl_reply_val(bpf, reply, 0),
map->value_size);
- dev_consume_skb_any(skb);
+ nfp_bpf_ctrl_op_cache_put(nfp_map, op, skb, cache_gen);
return 0;
err_free:
dev_kfree_skb_any(skb);
+err_cache_put:
+ nfp_bpf_ctrl_op_cache_put(nfp_map, op, NULL, cache_gen);
return err;
}
@@ -267,11 +414,29 @@ int nfp_bpf_ctrl_getnext_entry(struct bpf_offloaded_map *offmap,
key, NULL, 0, next_key, NULL);
}
+unsigned int nfp_bpf_ctrl_cmsg_min_mtu(struct nfp_app_bpf *bpf)
+{
+ return max(nfp_bpf_cmsg_map_req_size(bpf, 1),
+ nfp_bpf_cmsg_map_reply_size(bpf, 1));
+}
+
unsigned int nfp_bpf_ctrl_cmsg_mtu(struct nfp_app_bpf *bpf)
{
- return max3((unsigned int)NFP_NET_DEFAULT_MTU,
- nfp_bpf_cmsg_map_req_size(bpf, 1),
- nfp_bpf_cmsg_map_reply_size(bpf, 1));
+ return max3(NFP_NET_DEFAULT_MTU,
+ nfp_bpf_cmsg_map_req_size(bpf, NFP_BPF_MAP_CACHE_CNT),
+ nfp_bpf_cmsg_map_reply_size(bpf, NFP_BPF_MAP_CACHE_CNT));
+}
+
+unsigned int nfp_bpf_ctrl_cmsg_cache_cnt(struct nfp_app_bpf *bpf)
+{
+ unsigned int mtu, req_max, reply_max, entry_sz;
+
+ mtu = bpf->app->ctrl->dp.mtu;
+ entry_sz = bpf->cmsg_key_sz + bpf->cmsg_val_sz;
+ req_max = (mtu - sizeof(struct cmsg_req_map_op)) / entry_sz;
+ reply_max = (mtu - sizeof(struct cmsg_reply_map_op)) / entry_sz;
+
+ return min3(req_max, reply_max, NFP_BPF_MAP_CACHE_CNT);
}
void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb)
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
index 06c4286bd79e..a83a0ad5e27d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
@@ -24,6 +24,7 @@ enum bpf_cap_tlv_type {
NFP_BPF_CAP_TYPE_QUEUE_SELECT = 5,
NFP_BPF_CAP_TYPE_ADJUST_TAIL = 6,
NFP_BPF_CAP_TYPE_ABI_VERSION = 7,
+ NFP_BPF_CAP_TYPE_CMSG_MULTI_ENT = 8,
};
struct nfp_bpf_cap_tlv_func {
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 1c9fb11470df..8f732771d3fa 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -300,6 +300,14 @@ nfp_bpf_parse_cap_adjust_tail(struct nfp_app_bpf *bpf, void __iomem *value,
}
static int
+nfp_bpf_parse_cap_cmsg_multi_ent(struct nfp_app_bpf *bpf, void __iomem *value,
+ u32 length)
+{
+ bpf->cmsg_multi_ent = true;
+ return 0;
+}
+
+static int
nfp_bpf_parse_cap_abi_version(struct nfp_app_bpf *bpf, void __iomem *value,
u32 length)
{
@@ -375,6 +383,11 @@ static int nfp_bpf_parse_capabilities(struct nfp_app *app)
length))
goto err_release_free;
break;
+ case NFP_BPF_CAP_TYPE_CMSG_MULTI_ENT:
+ if (nfp_bpf_parse_cap_cmsg_multi_ent(app->priv, value,
+ length))
+ goto err_release_free;
+ break;
default:
nfp_dbg(cpp, "unknown BPF capability: %d\n", type);
break;
@@ -415,6 +428,25 @@ static void nfp_bpf_ndo_uninit(struct nfp_app *app, struct net_device *netdev)
bpf_offload_dev_netdev_unregister(bpf->bpf_dev, netdev);
}
+static int nfp_bpf_start(struct nfp_app *app)
+{
+ struct nfp_app_bpf *bpf = app->priv;
+
+ if (app->ctrl->dp.mtu < nfp_bpf_ctrl_cmsg_min_mtu(bpf)) {
+ nfp_err(bpf->app->cpp,
+ "ctrl channel MTU below min required %u < %u\n",
+ app->ctrl->dp.mtu, nfp_bpf_ctrl_cmsg_min_mtu(bpf));
+ return -EINVAL;
+ }
+
+ if (bpf->cmsg_multi_ent)
+ bpf->cmsg_cache_cnt = nfp_bpf_ctrl_cmsg_cache_cnt(bpf);
+ else
+ bpf->cmsg_cache_cnt = 1;
+
+ return 0;
+}
+
static int nfp_bpf_init(struct nfp_app *app)
{
struct nfp_app_bpf *bpf;
@@ -488,6 +520,7 @@ const struct nfp_app_type app_bpf = {
.init = nfp_bpf_init,
.clean = nfp_bpf_clean,
+ .start = nfp_bpf_start,
.check_mtu = nfp_bpf_check_mtu,
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 57d6ff51e980..fac9c6f9e197 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -99,6 +99,7 @@ enum pkt_vec {
* @maps_neutral: hash table of offload-neutral maps (on pointer)
*
* @abi_version: global BPF ABI version
+ * @cmsg_cache_cnt: number of entries to read for caching
*
* @adjust_head: adjust head capability
* @adjust_head.flags: extra flags for adjust head
@@ -124,6 +125,7 @@ enum pkt_vec {
* @pseudo_random: FW initialized the pseudo-random machinery (CSRs)
* @queue_select: BPF can set the RX queue ID in packet vector
* @adjust_tail: BPF can simply trunc packet size for adjust tail
+ * @cmsg_multi_ent: FW can pack multiple map entries in a single cmsg
*/
struct nfp_app_bpf {
struct nfp_app *app;
@@ -134,6 +136,8 @@ struct nfp_app_bpf {
unsigned int cmsg_key_sz;
unsigned int cmsg_val_sz;
+ unsigned int cmsg_cache_cnt;
+
struct list_head map_list;
unsigned int maps_in_use;
unsigned int map_elems_in_use;
@@ -169,6 +173,7 @@ struct nfp_app_bpf {
bool pseudo_random;
bool queue_select;
bool adjust_tail;
+ bool cmsg_multi_ent;
};
enum nfp_bpf_map_use {
@@ -183,11 +188,21 @@ struct nfp_bpf_map_word {
unsigned char non_zero_update :1;
};
+#define NFP_BPF_MAP_CACHE_CNT 4U
+#define NFP_BPF_MAP_CACHE_TIME_NS (250 * 1000)
+
/**
* struct nfp_bpf_map - private per-map data attached to BPF maps for offload
* @offmap: pointer to the offloaded BPF map
* @bpf: back pointer to bpf app private structure
* @tid: table id identifying map on datapath
+ *
+ * @cache_lock: protects @cache_blockers, @cache_to, @cache
+ * @cache_blockers: number of ops in flight which block caching
+ * @cache_gen: counter incremented by every blocker on exit
+ * @cache_to: time when cache will no longer be valid (ns)
+ * @cache: skb with cached response
+ *
* @l: link on the nfp_app_bpf->map_list list
* @use_map: map of how the value is used (in 4B chunks)
*/
@@ -195,6 +210,13 @@ struct nfp_bpf_map {
struct bpf_offloaded_map *offmap;
struct nfp_app_bpf *bpf;
u32 tid;
+
+ spinlock_t cache_lock;
+ u32 cache_blockers;
+ u32 cache_gen;
+ u64 cache_to;
+ struct sk_buff *cache;
+
struct list_head l;
struct nfp_bpf_map_word use_map[];
};
@@ -564,7 +586,9 @@ nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv);
+unsigned int nfp_bpf_ctrl_cmsg_min_mtu(struct nfp_app_bpf *bpf);
unsigned int nfp_bpf_ctrl_cmsg_mtu(struct nfp_app_bpf *bpf);
+unsigned int nfp_bpf_ctrl_cmsg_cache_cnt(struct nfp_app_bpf *bpf);
long long int
nfp_bpf_ctrl_alloc_map(struct nfp_app_bpf *bpf, struct bpf_map *map);
void
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 39c9fec222b4..88fab6a82acf 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -385,6 +385,7 @@ nfp_bpf_map_alloc(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap)
offmap->dev_priv = nfp_map;
nfp_map->offmap = offmap;
nfp_map->bpf = bpf;
+ spin_lock_init(&nfp_map->cache_lock);
res = nfp_bpf_ctrl_alloc_map(bpf, &offmap->map);
if (res < 0) {
@@ -407,6 +408,8 @@ nfp_bpf_map_free(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap)
struct nfp_bpf_map *nfp_map = offmap->dev_priv;
nfp_bpf_ctrl_free_map(bpf, nfp_map);
+ dev_consume_skb_any(nfp_map->cache);
+ WARN_ON_ONCE(nfp_map->cache_blockers);
list_del_init(&nfp_map->l);
bpf->map_elems_in_use -= offmap->map.max_entries;
bpf->maps_in_use--;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 5d6c3738b494..250f510b1d21 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -66,7 +66,7 @@
#define NFP_NET_MAX_DMA_BITS 40
/* Default size for MTU and freelist buffer sizes */
-#define NFP_NET_DEFAULT_MTU 1500
+#define NFP_NET_DEFAULT_MTU 1500U
/* Maximum number of bytes prepended to a packet */
#define NFP_NET_MAX_PREPEND 64
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 6f97b554f7da..61aabffc8888 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -4116,14 +4116,7 @@ int nfp_net_init(struct nfp_net *nn)
/* Set default MTU and Freelist buffer size */
if (!nfp_net_is_data_vnic(nn) && nn->app->ctrl_mtu) {
- if (nn->app->ctrl_mtu <= nn->max_mtu) {
- nn->dp.mtu = nn->app->ctrl_mtu;
- } else {
- if (nn->app->ctrl_mtu != NFP_APP_CTRL_MTU_MAX)
- nn_warn(nn, "app requested MTU above max supported %u > %u\n",
- nn->app->ctrl_mtu, nn->max_mtu);
- nn->dp.mtu = nn->max_mtu;
- }
+ nn->dp.mtu = min(nn->app->ctrl_mtu, nn->max_mtu);
} else if (nn->max_mtu < NFP_NET_DEFAULT_MTU) {
nn->dp.mtu = nn->max_mtu;
} else {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f9a506147c8a..5b9d22338606 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -24,6 +24,9 @@ struct seq_file;
struct btf;
struct btf_type;
+extern struct idr btf_idr;
+extern spinlock_t btf_idr_lock;
+
/* map is generic key/value storage optionally accesible by eBPF programs */
struct bpf_map_ops {
/* funcs callable from userspace (via syscall) */
@@ -647,6 +650,8 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);
struct bpf_map *bpf_map_get_with_uref(u32 ufd);
struct bpf_map *__bpf_map_get(struct fd f);
struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
+struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map,
+ bool uref);
void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map);
int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 5fe99f322b1c..26a6d58ca78c 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -355,6 +355,7 @@ struct bpf_verifier_env {
struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
int stack_size; /* number of states to be processed */
bool strict_alignment; /* perform strict pointer alignment checks */
+ bool test_state_freq; /* test verifier with different pruning frequency */
struct bpf_verifier_state *cur_state; /* current verifier state */
struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
struct bpf_verifier_state_list *free_list;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b5d28dadf964..d7d5626002e9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -901,6 +901,10 @@ struct netdev_bpf {
};
};
+/* Flags for ndo_xsk_wakeup. */
+#define XDP_WAKEUP_RX (1 << 0)
+#define XDP_WAKEUP_TX (1 << 1)
+
#ifdef CONFIG_XFRM_OFFLOAD
struct xfrmdev_ops {
int (*xdo_dev_state_add) (struct xfrm_state *x);
@@ -1227,6 +1231,12 @@ struct tlsdev_ops;
* that got dropped are freed/returned via xdp_return_frame().
* Returns negative number, means general error invoking ndo, meaning
* no frames were xmit'ed and core-caller will free all frames.
+ * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags);
+ * This function is used to wake up the softirq, ksoftirqd or kthread
+ * responsible for sending and/or receiving packets on a specific
+ * queue id bound to an AF_XDP socket. The flags field specifies if
+ * only RX, only Tx, or both should be woken up using the flags
+ * XDP_WAKEUP_RX and XDP_WAKEUP_TX.
* struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev);
* Get devlink port instance associated with a given netdev.
* Called with a reference on the netdevice and devlink locks only,
@@ -1426,8 +1436,8 @@ struct net_device_ops {
int (*ndo_xdp_xmit)(struct net_device *dev, int n,
struct xdp_frame **xdp,
u32 flags);
- int (*ndo_xsk_async_xmit)(struct net_device *dev,
- u32 queue_id);
+ int (*ndo_xsk_wakeup)(struct net_device *dev,
+ u32 queue_id, u32 flags);
struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev);
};
diff --git a/include/linux/tnum.h b/include/linux/tnum.h
index c7dc2b5902c0..c17af77f3fae 100644
--- a/include/linux/tnum.h
+++ b/include/linux/tnum.h
@@ -5,6 +5,10 @@
* propagate the unknown bits such that the tnum result represents all the
* possible results for possible values of the operands.
*/
+
+#ifndef _LINUX_TNUM_H
+#define _LINUX_TNUM_H
+
#include <linux/types.h>
struct tnum {
@@ -81,3 +85,5 @@ bool tnum_in(struct tnum a, struct tnum b);
int tnum_strn(char *str, size_t size, struct tnum a);
/* Format a tnum as tristate binary expansion */
int tnum_sbin(char *str, size_t size, struct tnum a);
+
+#endif /* _LINUX_TNUM_H */
diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
index b9dcb02e756b..8e4f831d2e52 100644
--- a/include/net/bpf_sk_storage.h
+++ b/include/net/bpf_sk_storage.h
@@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
extern const struct bpf_func_proto bpf_sk_storage_get_proto;
extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
+#ifdef CONFIG_BPF_SYSCALL
+int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
+#else
+static inline int bpf_sk_storage_clone(const struct sock *sk,
+ struct sock *newsk)
+{
+ return 0;
+}
+#endif
+
#endif /* _BPF_SK_STORAGE_H */
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 69796d264f06..c9398ce7960f 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -16,6 +16,13 @@
struct net_device;
struct xsk_queue;
+/* Masks for xdp_umem_page flags.
+ * The low 12-bits of the addr will be 0 since this is the page address, so we
+ * can use them for flags.
+ */
+#define XSK_NEXT_PG_CONTIG_SHIFT 0
+#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT)
+
struct xdp_umem_page {
void *addr;
dma_addr_t dma;
@@ -27,6 +34,13 @@ struct xdp_umem_fq_reuse {
u64 handles[];
};
+/* Flags for the umem flags field.
+ *
+ * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
+ * flags. See inlude/uapi/include/linux/if_xdp.h.
+ */
+#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1)
+
struct xdp_umem {
struct xsk_queue *fq;
struct xsk_queue *cq;
@@ -41,15 +55,27 @@ struct xdp_umem {
struct work_struct work;
struct page **pgs;
u32 npgs;
+ u16 queue_id;
+ u8 need_wakeup;
+ u8 flags;
int id;
struct net_device *dev;
struct xdp_umem_fq_reuse *fq_reuse;
- u16 queue_id;
bool zc;
spinlock_t xsk_list_lock;
struct list_head xsk_list;
};
+/* Nodes are linked in the struct xdp_sock map_list field, and used to
+ * track which maps a certain socket reside in.
+ */
+struct xsk_map;
+struct xsk_map_node {
+ struct list_head node;
+ struct xsk_map *map;
+ struct xdp_sock **map_entry;
+};
+
struct xdp_sock {
/* struct sock must be the first member of struct xdp_sock */
struct sock sk;
@@ -75,6 +101,9 @@ struct xdp_sock {
/* Protects generic receive. */
spinlock_t rx_lock;
u64 rx_dropped;
+ struct list_head map_list;
+ /* Protects map_list */
+ spinlock_t map_list_lock;
};
struct xdp_buff;
@@ -95,15 +124,47 @@ struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
struct xdp_umem_fq_reuse *newq);
void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
+void xsk_set_rx_need_wakeup(struct xdp_umem *umem);
+void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
+void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
+void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
+bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
+
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+ struct xdp_sock **map_entry);
+int xsk_map_inc(struct xsk_map *map);
+void xsk_map_put(struct xsk_map *map);
+
+static inline u64 xsk_umem_extract_addr(u64 addr)
+{
+ return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
+}
+
+static inline u64 xsk_umem_extract_offset(u64 addr)
+{
+ return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
+}
+
+static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
+{
+ return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr);
+}
static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
{
- return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
+ unsigned long page_addr;
+
+ addr = xsk_umem_add_offset_to_addr(addr);
+ page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr;
+
+ return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK);
}
static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
{
- return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
+ addr = xsk_umem_add_offset_to_addr(addr);
+
+ return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK);
}
/* Reuse-queue aware version of FILL queue helpers */
@@ -144,6 +205,19 @@ static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
rq->handles[rq->length++] = addr;
}
+
+/* Handle the offset appropriately depending on aligned or unaligned mode.
+ * For unaligned mode, we store the offset in the upper 16-bits of the address.
+ * For aligned mode, we simply add the offset to the address.
+ */
+static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address,
+ u64 offset)
+{
+ if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG)
+ return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
+ else
+ return address + offset;
+}
#else
static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
@@ -213,6 +287,21 @@ static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
return NULL;
}
+static inline u64 xsk_umem_extract_addr(u64 addr)
+{
+ return 0;
+}
+
+static inline u64 xsk_umem_extract_offset(u64 addr)
+{
+ return 0;
+}
+
+static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
+{
+ return 0;
+}
+
static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
{
return NULL;
@@ -241,6 +330,33 @@ static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
{
}
+static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+{
+ return false;
+}
+
+static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle,
+ u64 offset)
+{
+ return 0;
+}
+
#endif /* CONFIG_XDP_SOCKETS */
#endif /* _LINUX_XDP_SOCK_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0e66371bea13..77c6be96d676 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -106,6 +106,7 @@ enum bpf_cmd {
BPF_TASK_FD_QUERY,
BPF_MAP_LOOKUP_AND_DELETE_ELEM,
BPF_MAP_FREEZE,
+ BPF_BTF_GET_NEXT_ID,
};
enum bpf_map_type {
@@ -284,6 +285,9 @@ enum bpf_attach_type {
*/
#define BPF_F_TEST_RND_HI32 (1U << 2)
+/* The verifier internal test flag. Behavior is undefined */
+#define BPF_F_TEST_STATE_FREQ (1U << 3)
+
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions:
*
@@ -337,6 +341,9 @@ enum bpf_attach_type {
#define BPF_F_RDONLY_PROG (1U << 7)
#define BPF_F_WRONLY_PROG (1U << 8)
+/* Clone map from listener for newly accepted socket */
+#define BPF_F_CLONE (1U << 9)
+
/* flags for BPF_PROG_QUERY */
#define BPF_F_QUERY_EFFECTIVE (1U << 0)
@@ -576,6 +583,8 @@ union bpf_attr {
* limited to five).
*
* Each time the helper is called, it appends a line to the trace.
+ * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
+ * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
* The format of the trace is customizable, and the exact output
* one will get depends on the options set in
* *\/sys/kernel/debug/tracing/trace_options* (see also the
@@ -1014,7 +1023,7 @@ union bpf_attr {
* The realm of the route for the packet associated to *skb*, or 0
* if none was found.
*
- * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
* Description
* Write raw *data* blob into a special BPF perf event held by
* *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
@@ -1076,7 +1085,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
- * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
+ * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
* Description
* Walk a user or a kernel stack and return its id. To achieve
* this, the helper needs *ctx*, which is a pointer to the context
@@ -1725,7 +1734,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
- * int bpf_override_return(struct pt_reg *regs, u64 rc)
+ * int bpf_override_return(struct pt_regs *regs, u64 rc)
* Description
* Used for error injection, this helper uses kprobes to override
* the return value of the probed function, and to set it to *rc*.
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index faaa5ca2a117..be328c59389d 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -16,6 +16,18 @@
#define XDP_SHARED_UMEM (1 << 0)
#define XDP_COPY (1 << 1) /* Force copy-mode */
#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */
+/* If this option is set, the driver might go sleep and in that case
+ * the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be
+ * set. If it is set, the application need to explicitly wake up the
+ * driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are
+ * running the driver and the application on the same core, you should
+ * use this option so that the kernel will yield to the user space
+ * application.
+ */
+#define XDP_USE_NEED_WAKEUP (1 << 3)
+
+/* Flags for xsk_umem_config flags */
+#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
struct sockaddr_xdp {
__u16 sxdp_family;
@@ -25,10 +37,14 @@ struct sockaddr_xdp {
__u32 sxdp_shared_umem_fd;
};
+/* XDP_RING flags */
+#define XDP_RING_NEED_WAKEUP (1 << 0)
+
struct xdp_ring_offset {
__u64 producer;
__u64 consumer;
__u64 desc;
+ __u64 flags;
};
struct xdp_mmap_offsets {
@@ -53,6 +69,7 @@ struct xdp_umem_reg {
__u64 len; /* Length of packet data area */
__u32 chunk_size;
__u32 headroom;
+ __u32 flags;
};
struct xdp_statistics {
@@ -74,6 +91,11 @@ struct xdp_options {
#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL
#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL
+/* Masks for unaligned chunks mode */
+#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48
+#define XSK_UNALIGNED_BUF_ADDR_MASK \
+ ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
+
/* Rx/Tx descriptor */
struct xdp_desc {
__u64 addr;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 5fcc7a17eb5a..adb3adcebe3c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -195,8 +195,8 @@
i < btf_type_vlen(struct_type); \
i++, member++)
-static DEFINE_IDR(btf_idr);
-static DEFINE_SPINLOCK(btf_idr_lock);
+DEFINE_IDR(btf_idr);
+DEFINE_SPINLOCK(btf_idr_lock);
struct btf {
void *data;
@@ -3376,6 +3376,15 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
}
+#ifdef CONFIG_PROC_FS
+static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ const struct btf *btf = filp->private_data;
+
+ seq_printf(m, "btf_id:\t%u\n", btf->id);
+}
+#endif
+
static int btf_release(struct inode *inode, struct file *filp)
{
btf_put(filp->private_data);
@@ -3383,6 +3392,9 @@ static int btf_release(struct inode *inode, struct file *filp)
}
const struct file_operations btf_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_btf_show_fdinfo,
+#endif
.release = btf_release,
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 272071e9112f..82eabd4e38ad 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -683,8 +683,8 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
}
/* map_idr_lock should have been held */
-static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
- bool uref)
+static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map,
+ bool uref)
{
int refold;
@@ -704,6 +704,16 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
return map;
}
+struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+{
+ spin_lock_bh(&map_idr_lock);
+ map = __bpf_map_inc_not_zero(map, uref);
+ spin_unlock_bh(&map_idr_lock);
+
+ return map;
+}
+EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
+
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
return -ENOTSUPP;
@@ -1619,6 +1629,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
BPF_F_ANY_ALIGNMENT |
+ BPF_F_TEST_STATE_FREQ |
BPF_F_TEST_RND_HI32))
return -EINVAL;
@@ -2183,7 +2194,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
spin_lock_bh(&map_idr_lock);
map = idr_find(&map_idr, id);
if (map)
- map = bpf_map_inc_not_zero(map, true);
+ map = __bpf_map_inc_not_zero(map, true);
else
map = ERR_PTR(-ENOENT);
spin_unlock_bh(&map_idr_lock);
@@ -2880,6 +2891,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_obj_get_next_id(&attr, uattr,
&map_idr, &map_idr_lock);
break;
+ case BPF_BTF_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &btf_idr, &btf_idr_lock);
+ break;
case BPF_PROG_GET_FD_BY_ID:
err = bpf_prog_get_fd_by_id(&attr);
break;
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index 4659349fc795..7ae5dddd1fe6 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -30,17 +30,12 @@ static struct kobject *btf_kobj;
static int __init btf_vmlinux_init(void)
{
- int err;
-
if (!_binary__btf_vmlinux_bin_start)
return 0;
btf_kobj = kobject_create_and_add("btf", kernel_kobj);
- if (IS_ERR(btf_kobj)) {
- err = PTR_ERR(btf_kobj);
- btf_kobj = NULL;
- return err;
- }
+ if (!btf_kobj)
+ return -ENOMEM;
bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end -
_binary__btf_vmlinux_bin_start;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 16d66bd7af09..3fb50757e812 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7223,7 +7223,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new;
int i, j, err, states_cnt = 0;
- bool add_new_state = false;
+ bool add_new_state = env->test_state_freq ? true : false;
cur->last_insn_idx = env->prev_insn_idx;
if (!env->insn_aux_data[insn_idx].prune_point)
@@ -9263,6 +9263,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->allow_ptr_leaks = is_priv;
+ if (is_priv)
+ env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
+
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 9bb96ace9fa1..942c662e2eed 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -13,8 +13,71 @@ struct xsk_map {
struct bpf_map map;
struct xdp_sock **xsk_map;
struct list_head __percpu *flush_list;
+ spinlock_t lock; /* Synchronize map updates */
};
+int xsk_map_inc(struct xsk_map *map)
+{
+ struct bpf_map *m = &map->map;
+
+ m = bpf_map_inc(m, false);
+ return PTR_ERR_OR_ZERO(m);
+}
+
+void xsk_map_put(struct xsk_map *map)
+{
+ bpf_map_put(&map->map);
+}
+
+static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
+ struct xdp_sock **map_entry)
+{
+ struct xsk_map_node *node;
+ int err;
+
+ node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
+ if (!node)
+ return NULL;
+
+ err = xsk_map_inc(map);
+ if (err) {
+ kfree(node);
+ return ERR_PTR(err);
+ }
+
+ node->map = map;
+ node->map_entry = map_entry;
+ return node;
+}
+
+static void xsk_map_node_free(struct xsk_map_node *node)
+{
+ xsk_map_put(node->map);
+ kfree(node);
+}
+
+static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
+{
+ spin_lock_bh(&xs->map_list_lock);
+ list_add_tail(&node->node, &xs->map_list);
+ spin_unlock_bh(&xs->map_list_lock);
+}
+
+static void xsk_map_sock_delete(struct xdp_sock *xs,
+ struct xdp_sock **map_entry)
+{
+ struct xsk_map_node *n, *tmp;
+
+ spin_lock_bh(&xs->map_list_lock);
+ list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
+ if (map_entry == n->map_entry) {
+ list_del(&n->node);
+ xsk_map_node_free(n);
+ }
+ }
+ spin_unlock_bh(&xs->map_list_lock);
+}
+
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
struct xsk_map *m;
@@ -34,6 +97,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&m->map, attr);
+ spin_lock_init(&m->lock);
cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
cost += sizeof(struct list_head) * num_possible_cpus();
@@ -71,21 +135,9 @@ free_m:
static void xsk_map_free(struct bpf_map *map)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
- int i;
bpf_clear_redirect_map(map);
synchronize_net();
-
- for (i = 0; i < map->max_entries; i++) {
- struct xdp_sock *xs;
-
- xs = m->xsk_map[i];
- if (!xs)
- continue;
-
- sock_put((struct sock *)xs);
- }
-
free_percpu(m->flush_list);
bpf_map_area_free(m->xsk_map);
kfree(m);
@@ -164,8 +216,9 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xdp_sock *xs, *old_xs, **map_entry;
u32 i = *(u32 *)key, fd = *(u32 *)value;
- struct xdp_sock *xs, *old_xs;
+ struct xsk_map_node *node;
struct socket *sock;
int err;
@@ -173,8 +226,6 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EINVAL;
if (unlikely(i >= m->map.max_entries))
return -E2BIG;
- if (unlikely(map_flags == BPF_NOEXIST))
- return -EEXIST;
sock = sockfd_lookup(fd, &err);
if (!sock)
@@ -192,32 +243,70 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EOPNOTSUPP;
}
- sock_hold(sock->sk);
+ map_entry = &m->xsk_map[i];
+ node = xsk_map_node_alloc(m, map_entry);
+ if (IS_ERR(node)) {
+ sockfd_put(sock);
+ return PTR_ERR(node);
+ }
- old_xs = xchg(&m->xsk_map[i], xs);
+ spin_lock_bh(&m->lock);
+ old_xs = READ_ONCE(*map_entry);
+ if (old_xs == xs) {
+ err = 0;
+ goto out;
+ } else if (old_xs && map_flags == BPF_NOEXIST) {
+ err = -EEXIST;
+ goto out;
+ } else if (!old_xs && map_flags == BPF_EXIST) {
+ err = -ENOENT;
+ goto out;
+ }
+ xsk_map_sock_add(xs, node);
+ WRITE_ONCE(*map_entry, xs);
if (old_xs)
- sock_put((struct sock *)old_xs);
-
+ xsk_map_sock_delete(old_xs, map_entry);
+ spin_unlock_bh(&m->lock);
sockfd_put(sock);
return 0;
+
+out:
+ spin_unlock_bh(&m->lock);
+ sockfd_put(sock);
+ xsk_map_node_free(node);
+ return err;
}
static int xsk_map_delete_elem(struct bpf_map *map, void *key)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct xdp_sock *old_xs;
+ struct xdp_sock *old_xs, **map_entry;
int k = *(u32 *)key;
if (k >= map->max_entries)
return -EINVAL;
- old_xs = xchg(&m->xsk_map[k], NULL);
+ spin_lock_bh(&m->lock);
+ map_entry = &m->xsk_map[k];
+ old_xs = xchg(map_entry, NULL);
if (old_xs)
- sock_put((struct sock *)old_xs);
+ xsk_map_sock_delete(old_xs, map_entry);
+ spin_unlock_bh(&m->lock);
return 0;
}
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+ struct xdp_sock **map_entry)
+{
+ spin_lock_bh(&map->lock);
+ if (READ_ONCE(*map_entry) == xs) {
+ WRITE_ONCE(*map_entry, NULL);
+ xsk_map_sock_delete(xs, map_entry);
+ }
+ spin_unlock_bh(&map->lock);
+}
+
const struct bpf_map_ops xsk_map_ops = {
.map_alloc = xsk_map_alloc,
.map_free = xsk_map_free,
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 98da8998c25c..b09d7b1ffffd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -520,7 +520,8 @@ config BPF_EVENTS
bool
default y
help
- This allows the user to attach BPF programs to kprobe events.
+ This allows the user to attach BPF programs to kprobe, uprobe, and
+ tracepoint events.
config DYNAMIC_EVENTS
def_bool n
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index c41705835cba..5ef3eccee27c 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -867,7 +867,7 @@ static struct bpf_test tests[] = {
},
CLASSIC,
{ },
- { { 4, 10 ^ 300 }, { 20, 10 ^ 300 } },
+ { { 4, 0xA ^ 300 }, { 20, 0xA ^ 300 } },
},
{
"SPILL_FILL",
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 94c7f77ecb6b..da5639a5bd3b 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -12,6 +12,9 @@
static atomic_t cache_idx;
+#define SK_STORAGE_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_CLONE)
+
struct bucket {
struct hlist_head list;
raw_spinlock_t lock;
@@ -209,7 +212,6 @@ static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
kfree_rcu(sk_storage, rcu);
}
-/* sk_storage->lock must be held and sk_storage->list cannot be empty */
static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
struct bpf_sk_storage_elem *selem)
{
@@ -509,7 +511,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
return 0;
}
-/* Called by __sk_destruct() */
+/* Called by __sk_destruct() & bpf_sk_storage_clone() */
void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_sk_storage_elem *selem;
@@ -557,6 +559,11 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
smap = (struct bpf_sk_storage_map *)map;
+ /* Note that this map might be concurrently cloned from
+ * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
+ * RCU read section to finish before proceeding. New RCU
+ * read sections should be prevented via bpf_map_inc_not_zero.
+ */
synchronize_rcu();
/* bpf prog and the userspace can no longer access this map
@@ -601,7 +608,9 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
{
- if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries ||
+ if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->max_entries ||
attr->key_size != sizeof(int) || !attr->value_size ||
/* Enforce BTF for userspace sk dumping */
!attr->btf_key_type_id || !attr->btf_value_type_id)
@@ -739,6 +748,95 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
return err;
}
+static struct bpf_sk_storage_elem *
+bpf_sk_storage_clone_elem(struct sock *newsk,
+ struct bpf_sk_storage_map *smap,
+ struct bpf_sk_storage_elem *selem)
+{
+ struct bpf_sk_storage_elem *copy_selem;
+
+ copy_selem = selem_alloc(smap, newsk, NULL, true);
+ if (!copy_selem)
+ return NULL;
+
+ if (map_value_has_spin_lock(&smap->map))
+ copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
+ SDATA(selem)->data, true);
+ else
+ copy_map_value(&smap->map, SDATA(copy_selem)->data,
+ SDATA(selem)->data);
+
+ return copy_selem;
+}
+
+int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
+{
+ struct bpf_sk_storage *new_sk_storage = NULL;
+ struct bpf_sk_storage *sk_storage;
+ struct bpf_sk_storage_elem *selem;
+ int ret = 0;
+
+ RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
+
+ rcu_read_lock();
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+
+ if (!sk_storage || hlist_empty(&sk_storage->list))
+ goto out;
+
+ hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
+ struct bpf_sk_storage_elem *copy_selem;
+ struct bpf_sk_storage_map *smap;
+ struct bpf_map *map;
+
+ smap = rcu_dereference(SDATA(selem)->smap);
+ if (!(smap->map.map_flags & BPF_F_CLONE))
+ continue;
+
+ /* Note that for lockless listeners adding new element
+ * here can race with cleanup in bpf_sk_storage_map_free.
+ * Try to grab map refcnt to make sure that it's still
+ * alive and prevent concurrent removal.
+ */
+ map = bpf_map_inc_not_zero(&smap->map, false);
+ if (IS_ERR(map))
+ continue;
+
+ copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
+ if (!copy_selem) {
+ ret = -ENOMEM;
+ bpf_map_put(map);
+ goto out;
+ }
+
+ if (new_sk_storage) {
+ selem_link_map(smap, copy_selem);
+ __selem_link_sk(new_sk_storage, copy_selem);
+ } else {
+ ret = sk_storage_alloc(newsk, smap, copy_selem);
+ if (ret) {
+ kfree(copy_selem);
+ atomic_sub(smap->elem_size,
+ &newsk->sk_omem_alloc);
+ bpf_map_put(map);
+ goto out;
+ }
+
+ new_sk_storage = rcu_dereference(copy_selem->sk_storage);
+ }
+ bpf_map_put(map);
+ }
+
+out:
+ rcu_read_unlock();
+
+ /* In case of an error, don't free anything explicitly here, the
+ * caller is responsible to call bpf_sk_storage_free.
+ */
+
+ return ret;
+}
+
BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
void *, value, u64, flags)
{
diff --git a/net/core/dev.c b/net/core/dev.c
index 49589ed2018d..b1afafee3e2a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8126,12 +8126,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
bpf_chk = generic_xdp_install;
if (fd >= 0) {
+ u32 prog_id;
+
if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
return -EEXIST;
}
- if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
- __dev_xdp_query(dev, bpf_op, query)) {
+
+ prog_id = __dev_xdp_query(dev, bpf_op, query);
+ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
NL_SET_ERR_MSG(extack, "XDP program already attached");
return -EBUSY;
}
@@ -8146,6 +8149,14 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
bpf_prog_put(prog);
return -EINVAL;
}
+
+ if (prog->aux->id == prog_id) {
+ bpf_prog_put(prog);
+ return 0;
+ }
+ } else {
+ if (!__dev_xdp_query(dev, bpf_op, query))
+ return 0;
}
err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
diff --git a/net/core/filter.c b/net/core/filter.c
index b91988f8b94e..ed6563622ce3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5903,7 +5903,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
default:
return -EPROTONOSUPPORT;
}
- if (mss <= 0)
+ if (mss == 0)
return -ENOENT;
return cookie | ((u64)mss << 32);
diff --git a/net/core/sock.c b/net/core/sock.c
index 545fac19a711..07863edbe6fc 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
goto out;
}
RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
-#ifdef CONFIG_BPF_SYSCALL
- RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
-#endif
+
+ if (bpf_sk_storage_clone(sk, newsk)) {
+ sk_free_unlock_clone(newsk);
+ newsk = NULL;
+ goto out;
+ }
newsk->sk_err = 0;
newsk->sk_err_soft = 0;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 0e0062127124..947b8ff0227e 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -14,7 +14,7 @@
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/idr.h>
-#include <linux/highmem.h>
+#include <linux/vmalloc.h>
#include "xdp_umem.h"
#include "xsk_queue.h"
@@ -106,14 +106,22 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
umem->dev = dev;
umem->queue_id = queue_id;
+ if (flags & XDP_USE_NEED_WAKEUP) {
+ umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
+ /* Tx needs to be explicitly woken up the first time.
+ * Also for supporting drivers that do not implement this
+ * feature. They will always have to call sendto().
+ */
+ xsk_set_tx_need_wakeup(umem);
+ }
+
dev_hold(dev);
if (force_copy)
/* For copy-mode, we are done. */
return 0;
- if (!dev->netdev_ops->ndo_bpf ||
- !dev->netdev_ops->ndo_xsk_async_xmit) {
+ if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) {
err = -EOPNOTSUPP;
goto err_unreg_umem;
}
@@ -170,7 +178,30 @@ static void xdp_umem_unmap_pages(struct xdp_umem *umem)
unsigned int i;
for (i = 0; i < umem->npgs; i++)
- kunmap(umem->pgs[i]);
+ if (PageHighMem(umem->pgs[i]))
+ vunmap(umem->pages[i].addr);
+}
+
+static int xdp_umem_map_pages(struct xdp_umem *umem)
+{
+ unsigned int i;
+ void *addr;
+
+ for (i = 0; i < umem->npgs; i++) {
+ if (PageHighMem(umem->pgs[i]))
+ addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);
+ else
+ addr = page_address(umem->pgs[i]);
+
+ if (!addr) {
+ xdp_umem_unmap_pages(umem);
+ return -ENOMEM;
+ }
+
+ umem->pages[i].addr = addr;
+ }
+
+ return 0;
}
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
@@ -309,10 +340,11 @@ static int xdp_umem_account_pages(struct xdp_umem *umem)
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
+ bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
unsigned int chunks, chunks_per_page;
u64 addr = mr->addr, size = mr->len;
- int size_chk, err, i;
+ int size_chk, err;
if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
/* Strictly speaking we could support this, if:
@@ -324,7 +356,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
return -EINVAL;
}
- if (!is_power_of_2(chunk_size))
+ if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG |
+ XDP_UMEM_USES_NEED_WAKEUP))
+ return -EINVAL;
+
+ if (!unaligned_chunks && !is_power_of_2(chunk_size))
return -EINVAL;
if (!PAGE_ALIGNED(addr)) {
@@ -341,9 +377,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (chunks == 0)
return -EINVAL;
- chunks_per_page = PAGE_SIZE / chunk_size;
- if (chunks < chunks_per_page || chunks % chunks_per_page)
- return -EINVAL;
+ if (!unaligned_chunks) {
+ chunks_per_page = PAGE_SIZE / chunk_size;
+ if (chunks < chunks_per_page || chunks % chunks_per_page)
+ return -EINVAL;
+ }
headroom = ALIGN(headroom, 64);
@@ -352,13 +390,15 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
return -EINVAL;
umem->address = (unsigned long)addr;
- umem->chunk_mask = ~((u64)chunk_size - 1);
+ umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
+ : ~((u64)chunk_size - 1);
umem->size = size;
umem->headroom = headroom;
umem->chunk_size_nohr = chunk_size - headroom;
umem->npgs = size / PAGE_SIZE;
umem->pgs = NULL;
umem->user = NULL;
+ umem->flags = mr->flags;
INIT_LIST_HEAD(&umem->xsk_list);
spin_lock_init(&umem->xsk_list_lock);
@@ -378,10 +418,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
goto out_pin;
}
- for (i = 0; i < umem->npgs; i++)
- umem->pages[i].addr = kmap(umem->pgs[i]);
+ err = xdp_umem_map_pages(umem);
+ if (!err)
+ return 0;
- return 0;
+ kfree(umem->pages);
out_pin:
xdp_umem_unpin_pages(umem);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 59b57d708697..c2f1af3b6a7c 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -45,7 +45,7 @@ EXPORT_SYMBOL(xsk_umem_has_addrs);
u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
- return xskq_peek_addr(umem->fq, addr);
+ return xskq_peek_addr(umem->fq, addr, umem);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);
@@ -55,21 +55,103 @@ void xsk_umem_discard_addr(struct xdp_umem *umem)
}
EXPORT_SYMBOL(xsk_umem_discard_addr);
+void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
+{
+ if (umem->need_wakeup & XDP_WAKEUP_RX)
+ return;
+
+ umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
+ umem->need_wakeup |= XDP_WAKEUP_RX;
+}
+EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
+
+void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+{
+ struct xdp_sock *xs;
+
+ if (umem->need_wakeup & XDP_WAKEUP_TX)
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+ xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
+ }
+ rcu_read_unlock();
+
+ umem->need_wakeup |= XDP_WAKEUP_TX;
+}
+EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
+
+void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+{
+ if (!(umem->need_wakeup & XDP_WAKEUP_RX))
+ return;
+
+ umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+ umem->need_wakeup &= ~XDP_WAKEUP_RX;
+}
+EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
+
+void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+{
+ struct xdp_sock *xs;
+
+ if (!(umem->need_wakeup & XDP_WAKEUP_TX))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+ xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+ }
+ rcu_read_unlock();
+
+ umem->need_wakeup &= ~XDP_WAKEUP_TX;
+}
+EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
+
+bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+{
+ return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
+}
+EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
+
+/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for
+ * each page. This is only required in copy mode.
+ */
+static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
+ u32 len, u32 metalen)
+{
+ void *to_buf = xdp_umem_get_data(umem, addr);
+
+ addr = xsk_umem_add_offset_to_addr(addr);
+ if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
+ void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
+ u64 page_start = addr & ~(PAGE_SIZE - 1);
+ u64 first_len = PAGE_SIZE - (addr - page_start);
+
+ memcpy(to_buf, from_buf, first_len + metalen);
+ memcpy(next_pg_addr, from_buf + first_len, len - first_len);
+
+ return;
+ }
+
+ memcpy(to_buf, from_buf, len + metalen);
+}
+
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
- void *to_buf, *from_buf;
+ u64 offset = xs->umem->headroom;
+ u64 addr, memcpy_addr;
+ void *from_buf;
u32 metalen;
- u64 addr;
int err;
- if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+ if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
xs->rx_dropped++;
return -ENOSPC;
}
- addr += xs->umem->headroom;
-
if (unlikely(xdp_data_meta_unsupported(xdp))) {
from_buf = xdp->data;
metalen = 0;
@@ -78,9 +160,11 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
metalen = xdp->data - xdp->data_meta;
}
- to_buf = xdp_umem_get_data(xs->umem, addr);
- memcpy(to_buf, from_buf, len + metalen);
- addr += metalen;
+ memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
+ __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
+
+ offset += metalen;
+ addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
err = xskq_produce_batch_desc(xs->rx, addr, len);
if (!err) {
xskq_discard_addr(xs->umem->fq);
@@ -102,10 +186,23 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
return err;
}
+static bool xsk_is_bound(struct xdp_sock *xs)
+{
+ if (READ_ONCE(xs->state) == XSK_BOUND) {
+ /* Matches smp_wmb() in bind(). */
+ smp_rmb();
+ return true;
+ }
+ return false;
+}
+
int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
u32 len;
+ if (!xsk_is_bound(xs))
+ return -EINVAL;
+
if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
return -EINVAL;
@@ -125,6 +222,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
u32 metalen = xdp->data - xdp->data_meta;
u32 len = xdp->data_end - xdp->data;
+ u64 offset = xs->umem->headroom;
void *buffer;
u64 addr;
int err;
@@ -136,17 +234,17 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
goto out_unlock;
}
- if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+ if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
err = -ENOSPC;
goto out_drop;
}
- addr += xs->umem->headroom;
-
+ addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
buffer = xdp_umem_get_data(xs->umem, addr);
memcpy(buffer, xdp->data_meta, len + metalen);
- addr += metalen;
+
+ addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
err = xskq_produce_batch_desc(xs->rx, addr, len);
if (err)
goto out_drop;
@@ -190,7 +288,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
- if (!xskq_peek_desc(xs->tx, desc))
+ if (!xskq_peek_desc(xs->tx, desc, umem))
continue;
if (xskq_produce_addr_lazy(umem->cq, desc->addr))
@@ -212,7 +310,8 @@ static int xsk_zc_xmit(struct sock *sk)
struct xdp_sock *xs = xdp_sk(sk);
struct net_device *dev = xs->dev;
- return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
+ return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
+ XDP_WAKEUP_TX);
}
static void xsk_destruct_skb(struct sk_buff *skb)
@@ -243,7 +342,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
if (xs->queue_id >= xs->dev->real_num_tx_queues)
goto out;
- while (xskq_peek_desc(xs->tx, &desc)) {
+ while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
char *buffer;
u64 addr;
u32 len;
@@ -272,7 +371,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
skb->dev = xs->dev;
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
+ skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
skb->destructor = xsk_destruct_skb;
err = dev_direct_xmit(skb, xs->queue_id);
@@ -301,7 +400,7 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
- if (unlikely(!xs->dev))
+ if (unlikely(!xsk_is_bound(xs)))
return -ENXIO;
if (unlikely(!(xs->dev->flags & IFF_UP)))
return -ENETDOWN;
@@ -317,8 +416,19 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait)
{
unsigned int mask = datagram_poll(file, sock, wait);
- struct sock *sk = sock->sk;
- struct xdp_sock *xs = xdp_sk(sk);
+ struct xdp_sock *xs = xdp_sk(sock->sk);
+ struct net_device *dev;
+ struct xdp_umem *umem;
+
+ if (unlikely(!xsk_is_bound(xs)))
+ return mask;
+
+ dev = xs->dev;
+ umem = xs->umem;
+
+ if (umem->need_wakeup)
+ dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
+ umem->need_wakeup);
if (xs->rx && !xskq_empty_desc(xs->rx))
mask |= POLLIN | POLLRDNORM;
@@ -342,7 +452,7 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
/* Make sure queue is ready before it can be seen by others */
smp_wmb();
- *queue = q;
+ WRITE_ONCE(*queue, q);
return 0;
}
@@ -350,10 +460,9 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
{
struct net_device *dev = xs->dev;
- if (!dev || xs->state != XSK_BOUND)
+ if (xs->state != XSK_BOUND)
return;
-
- xs->state = XSK_UNBOUND;
+ WRITE_ONCE(xs->state, XSK_UNBOUND);
/* Wait for driver to stop using the xdp socket. */
xdp_del_sk_umem(xs->umem, xs);
@@ -362,6 +471,52 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
dev_put(dev);
}
+static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
+ struct xdp_sock ***map_entry)
+{
+ struct xsk_map *map = NULL;
+ struct xsk_map_node *node;
+
+ *map_entry = NULL;
+
+ spin_lock_bh(&xs->map_list_lock);
+ node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
+ node);
+ if (node) {
+ WARN_ON(xsk_map_inc(node->map));
+ map = node->map;
+ *map_entry = node->map_entry;
+ }
+ spin_unlock_bh(&xs->map_list_lock);
+ return map;
+}
+
+static void xsk_delete_from_maps(struct xdp_sock *xs)
+{
+ /* This function removes the current XDP socket from all the
+ * maps it resides in. We need to take extra care here, due to
+ * the two locks involved. Each map has a lock synchronizing
+ * updates to the entries, and each socket has a lock that
+ * synchronizes access to the list of maps (map_list). For
+ * deadlock avoidance the locks need to be taken in the order
+ * "map lock"->"socket map list lock". We start off by
+ * accessing the socket map list, and take a reference to the
+ * map to guarantee existence between the
+ * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
+ * calls. Then we ask the map to remove the socket, which
+ * tries to remove the socket from the map. Note that there
+ * might be updates to the map between
+ * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
+ */
+ struct xdp_sock **map_entry = NULL;
+ struct xsk_map *map;
+
+ while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
+ xsk_map_try_sock_delete(map, xs, map_entry);
+ xsk_map_put(map);
+ }
+}
+
static int xsk_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -381,7 +536,10 @@ static int xsk_release(struct socket *sock)
sock_prot_inuse_add(net, sk->sk_prot, -1);
local_bh_enable();
+ xsk_delete_from_maps(xs);
+ mutex_lock(&xs->mutex);
xsk_unbind_dev(xs);
+ mutex_unlock(&xs->mutex);
xskq_destroy(xs->rx);
xskq_destroy(xs->tx);
@@ -412,6 +570,24 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd)
return sock;
}
+/* Check if umem pages are contiguous.
+ * If zero-copy mode, use the DMA address to do the page contiguity check
+ * For all other modes we use addr (kernel virtual address)
+ * Store the result in the low bits of addr.
+ */
+static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
+{
+ struct xdp_umem_page *pgs = umem->pages;
+ int i, is_contig;
+
+ for (i = 0; i < umem->npgs - 1; i++) {
+ is_contig = (flags & XDP_ZEROCOPY) ?
+ (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
+ (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
+ pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
+ }
+}
+
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
@@ -427,7 +603,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
return -EINVAL;
flags = sxdp->sxdp_flags;
- if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY))
+ if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
+ XDP_USE_NEED_WAKEUP))
return -EINVAL;
rtnl_lock();
@@ -454,7 +631,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
struct xdp_sock *umem_xs;
struct socket *sock;
- if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
+ if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
+ (flags & XDP_USE_NEED_WAKEUP)) {
/* Cannot specify flags for shared sockets. */
err = -EINVAL;
goto out_unlock;
@@ -473,19 +651,19 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
}
umem_xs = xdp_sk(sock->sk);
- if (!umem_xs->umem) {
- /* No umem to inherit. */
+ if (!xsk_is_bound(umem_xs)) {
err = -EBADF;
sockfd_put(sock);
goto out_unlock;
- } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
+ }
+ if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
err = -EINVAL;
sockfd_put(sock);
goto out_unlock;
}
xdp_get_umem(umem_xs->umem);
- xs->umem = umem_xs->umem;
+ WRITE_ONCE(xs->umem, umem_xs->umem);
sockfd_put(sock);
} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
err = -EINVAL;
@@ -500,6 +678,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
if (err)
goto out_unlock;
+
+ xsk_check_page_contiguity(xs->umem, flags);
}
xs->dev = dev;
@@ -510,16 +690,28 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
xdp_add_sk_umem(xs->umem, xs);
out_unlock:
- if (err)
+ if (err) {
dev_put(dev);
- else
- xs->state = XSK_BOUND;
+ } else {
+ /* Matches smp_rmb() in bind() for shared umem
+ * sockets, and xsk_is_bound().
+ */
+ smp_wmb();
+ WRITE_ONCE(xs->state, XSK_BOUND);
+ }
out_release:
mutex_unlock(&xs->mutex);
rtnl_unlock();
return err;
}
+struct xdp_umem_reg_v1 {
+ __u64 addr; /* Start of packet data area */
+ __u64 len; /* Length of packet data area */
+ __u32 chunk_size;
+ __u32 headroom;
+};
+
static int xsk_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
@@ -549,15 +741,24 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
}
q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
err = xsk_init_queue(entries, q, false);
+ if (!err && optname == XDP_TX_RING)
+ /* Tx needs to be explicitly woken up the first time */
+ xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
mutex_unlock(&xs->mutex);
return err;
}
case XDP_UMEM_REG:
{
- struct xdp_umem_reg mr;
+ size_t mr_size = sizeof(struct xdp_umem_reg);
+ struct xdp_umem_reg mr = {};
struct xdp_umem *umem;
- if (copy_from_user(&mr, optval, sizeof(mr)))
+ if (optlen < sizeof(struct xdp_umem_reg_v1))
+ return -EINVAL;
+ else if (optlen < sizeof(mr))
+ mr_size = sizeof(struct xdp_umem_reg_v1);
+
+ if (copy_from_user(&mr, optval, mr_size))
return -EFAULT;
mutex_lock(&xs->mutex);
@@ -574,7 +775,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
/* Make sure umem is ready before it can be seen by others */
smp_wmb();
- xs->umem = umem;
+ WRITE_ONCE(xs->umem, umem);
mutex_unlock(&xs->mutex);
return 0;
}
@@ -610,6 +811,20 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
return -ENOPROTOOPT;
}
+static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
+{
+ ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+ ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+ ring->desc = offsetof(struct xdp_rxtx_ring, desc);
+}
+
+static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
+{
+ ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+ ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+ ring->desc = offsetof(struct xdp_umem_ring, desc);
+}
+
static int xsk_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -649,26 +864,49 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
case XDP_MMAP_OFFSETS:
{
struct xdp_mmap_offsets off;
+ struct xdp_mmap_offsets_v1 off_v1;
+ bool flags_supported = true;
+ void *to_copy;
- if (len < sizeof(off))
+ if (len < sizeof(off_v1))
return -EINVAL;
+ else if (len < sizeof(off))
+ flags_supported = false;
+
+ if (flags_supported) {
+ /* xdp_ring_offset is identical to xdp_ring_offset_v1
+ * except for the flags field added to the end.
+ */
+ xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
+ &off.rx);
+ xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
+ &off.tx);
+ xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
+ &off.fr);
+ xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
+ &off.cr);
+ off.rx.flags = offsetof(struct xdp_rxtx_ring,
+ ptrs.flags);
+ off.tx.flags = offsetof(struct xdp_rxtx_ring,
+ ptrs.flags);
+ off.fr.flags = offsetof(struct xdp_umem_ring,
+ ptrs.flags);
+ off.cr.flags = offsetof(struct xdp_umem_ring,
+ ptrs.flags);
+
+ len = sizeof(off);
+ to_copy = &off;
+ } else {
+ xsk_enter_rxtx_offsets(&off_v1.rx);
+ xsk_enter_rxtx_offsets(&off_v1.tx);
+ xsk_enter_umem_offsets(&off_v1.fr);
+ xsk_enter_umem_offsets(&off_v1.cr);
+
+ len = sizeof(off_v1);
+ to_copy = &off_v1;
+ }
- off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
- off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
- off.rx.desc = offsetof(struct xdp_rxtx_ring, desc);
- off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
- off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
- off.tx.desc = offsetof(struct xdp_rxtx_ring, desc);
-
- off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
- off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
- off.fr.desc = offsetof(struct xdp_umem_ring, desc);
- off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
- off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
- off.cr.desc = offsetof(struct xdp_umem_ring, desc);
-
- len = sizeof(off);
- if (copy_to_user(optval, &off, len))
+ if (copy_to_user(optval, to_copy, len))
return -EFAULT;
if (put_user(len, optlen))
return -EFAULT;
@@ -713,7 +951,7 @@ static int xsk_mmap(struct file *file, struct socket *sock,
unsigned long pfn;
struct page *qpg;
- if (xs->state != XSK_READY)
+ if (READ_ONCE(xs->state) != XSK_READY)
return -EBUSY;
if (offset == XDP_PGOFF_RX_RING) {
@@ -855,6 +1093,9 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
spin_lock_init(&xs->rx_lock);
spin_lock_init(&xs->tx_completion_lock);
+ INIT_LIST_HEAD(&xs->map_list);
+ spin_lock_init(&xs->map_list_lock);
+
mutex_lock(&net->xdp.lock);
sk_add_node_rcu(sk, &net->xdp.list);
mutex_unlock(&net->xdp.lock);
diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
index ba8120610426..4cfd106bdb53 100644
--- a/net/xdp/xsk.h
+++ b/net/xdp/xsk.h
@@ -4,6 +4,19 @@
#ifndef XSK_H_
#define XSK_H_
+struct xdp_ring_offset_v1 {
+ __u64 producer;
+ __u64 consumer;
+ __u64 desc;
+};
+
+struct xdp_mmap_offsets_v1 {
+ struct xdp_ring_offset_v1 rx;
+ struct xdp_ring_offset_v1 tx;
+ struct xdp_ring_offset_v1 fr;
+ struct xdp_ring_offset_v1 cr;
+};
+
static inline struct xdp_sock *xdp_sk(struct sock *sk)
{
return (struct xdp_sock *)sk;
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index d5e06c8e0cbf..f59791ba43a0 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
du.id = umem->id;
du.size = umem->size;
du.num_pages = umem->npgs;
- du.chunk_size = (__u32)(~umem->chunk_mask + 1);
+ du.chunk_size = umem->chunk_size_nohr + umem->headroom;
du.headroom = umem->headroom;
du.ifindex = umem->dev ? umem->dev->ifindex : 0;
du.queue_id = umem->queue_id;
@@ -97,6 +97,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
msg->xdiag_ino = sk_ino;
sock_diag_save_cookie(sk, msg->xdiag_cookie);
+ mutex_lock(&xs->mutex);
if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb))
goto out_nlmsg_trim;
@@ -117,10 +118,12 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO))
goto out_nlmsg_trim;
+ mutex_unlock(&xs->mutex);
nlmsg_end(nlskb, nlh);
return 0;
out_nlmsg_trim:
+ mutex_unlock(&xs->mutex);
nlmsg_cancel(nlskb, nlh);
return -EMSGSIZE;
}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 909c5168ed0f..eddae4688862 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -16,6 +16,7 @@
struct xdp_ring {
u32 producer ____cacheline_aligned_in_smp;
u32 consumer ____cacheline_aligned_in_smp;
+ u32 flags;
};
/* Used for the RX and TX queues for packets */
@@ -133,6 +134,17 @@ static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt)
/* UMEM queue */
+static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr,
+ u64 length)
+{
+ bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE;
+ bool next_pg_contig =
+ (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr &
+ XSK_NEXT_PG_CONTIG_MASK;
+
+ return cross_pg && !next_pg_contig;
+}
+
static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
{
if (addr >= q->size) {
@@ -143,23 +155,51 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
return true;
}
-static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
+static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr,
+ u64 length,
+ struct xdp_umem *umem)
+{
+ u64 base_addr = xsk_umem_extract_addr(addr);
+
+ addr = xsk_umem_add_offset_to_addr(addr);
+ if (base_addr >= q->size || addr >= q->size ||
+ xskq_crosses_non_contig_pg(umem, addr, length)) {
+ q->invalid_descs++;
+ return false;
+ }
+
+ return true;
+}
+
+static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr,
+ struct xdp_umem *umem)
{
while (q->cons_tail != q->cons_head) {
struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
unsigned int idx = q->cons_tail & q->ring_mask;
*addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask;
+
+ if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
+ if (xskq_is_valid_addr_unaligned(q, *addr,
+ umem->chunk_size_nohr,
+ umem))
+ return addr;
+ goto out;
+ }
+
if (xskq_is_valid_addr(q, *addr))
return addr;
+out:
q->cons_tail++;
}
return NULL;
}
-static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
+static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr,
+ struct xdp_umem *umem)
{
if (q->cons_tail == q->cons_head) {
smp_mb(); /* D, matches A */
@@ -170,7 +210,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
smp_rmb();
}
- return xskq_validate_addr(q, addr);
+ return xskq_validate_addr(q, addr, umem);
}
static inline void xskq_discard_addr(struct xsk_queue *q)
@@ -229,8 +269,21 @@ static inline int xskq_reserve_addr(struct xsk_queue *q)
/* Rx/Tx queue */
-static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
+static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d,
+ struct xdp_umem *umem)
{
+ if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
+ if (!xskq_is_valid_addr_unaligned(q, d->addr, d->len, umem))
+ return false;
+
+ if (d->len > umem->chunk_size_nohr || d->options) {
+ q->invalid_descs++;
+ return false;
+ }
+
+ return true;
+ }
+
if (!xskq_is_valid_addr(q, d->addr))
return false;
@@ -244,14 +297,15 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
}
static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
- struct xdp_desc *desc)
+ struct xdp_desc *desc,
+ struct xdp_umem *umem)
{
while (q->cons_tail != q->cons_head) {
struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
unsigned int idx = q->cons_tail & q->ring_mask;
*desc = READ_ONCE(ring->desc[idx]);
- if (xskq_is_valid_desc(q, desc))
+ if (xskq_is_valid_desc(q, desc, umem))
return desc;
q->cons_tail++;
@@ -261,7 +315,8 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
}
static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
- struct xdp_desc *desc)
+ struct xdp_desc *desc,
+ struct xdp_umem *umem)
{
if (q->cons_tail == q->cons_head) {
smp_mb(); /* D, matches A */
@@ -272,7 +327,7 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
smp_rmb(); /* C, matches B */
}
- return xskq_validate_desc(q, desc);
+ return xskq_validate_desc(q, desc, umem);
}
static inline void xskq_discard_desc(struct xsk_queue *q)
diff --git a/samples/bpf/syscall_nrs.c b/samples/bpf/syscall_nrs.c
index 516e255cbe8f..88f940052450 100644
--- a/samples/bpf/syscall_nrs.c
+++ b/samples/bpf/syscall_nrs.c
@@ -9,5 +9,11 @@ void syscall_defines(void)
COMMENT("Linux system call numbers.");
SYSNR(__NR_write);
SYSNR(__NR_read);
+#ifdef __NR_mmap2
+ SYSNR(__NR_mmap2);
+#endif
+#ifdef __NR_mmap
SYSNR(__NR_mmap);
+#endif
+
}
diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c
index f57f4e1ea1ec..35cb0eed3be5 100644
--- a/samples/bpf/tracex5_kern.c
+++ b/samples/bpf/tracex5_kern.c
@@ -68,12 +68,25 @@ PROG(SYS__NR_read)(struct pt_regs *ctx)
return 0;
}
+#ifdef __NR_mmap2
+PROG(SYS__NR_mmap2)(struct pt_regs *ctx)
+{
+ char fmt[] = "mmap2\n";
+
+ bpf_trace_printk(fmt, sizeof(fmt));
+ return 0;
+}
+#endif
+
+#ifdef __NR_mmap
PROG(SYS__NR_mmap)(struct pt_regs *ctx)
{
char fmt[] = "mmap\n";
+
bpf_trace_printk(fmt, sizeof(fmt));
return 0;
}
+#endif
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 93eaaf7239b2..102eace22956 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -67,8 +67,14 @@ static int opt_ifindex;
static int opt_queue;
static int opt_poll;
static int opt_interval = 1;
+static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
+static u32 opt_umem_flags;
+static int opt_unaligned_chunks;
+static int opt_mmap_flags;
static u32 opt_xdp_bind_flags;
static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
+static int opt_timeout = 1000;
+static bool opt_need_wakeup = true;
static __u32 prog_id;
struct xsk_umem_info {
@@ -282,7 +288,9 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
.frame_size = opt_xsk_frame_size,
.frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
+ .flags = opt_umem_flags
};
+
int ret;
umem = calloc(1, sizeof(*umem));
@@ -291,6 +299,7 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
&cfg);
+
if (ret)
exit_with_error(-ret);
@@ -352,6 +361,8 @@ static struct option long_options[] = {
{"zero-copy", no_argument, 0, 'z'},
{"copy", no_argument, 0, 'c'},
{"frame-size", required_argument, 0, 'f'},
+ {"no-need-wakeup", no_argument, 0, 'm'},
+ {"unaligned", no_argument, 0, 'u'},
{0, 0, 0, 0}
};
@@ -372,6 +383,9 @@ static void usage(const char *prog)
" -z, --zero-copy Force zero-copy mode.\n"
" -c, --copy Force copy mode.\n"
" -f, --frame-size=n Set the frame size (must be a power of two, default is %d).\n"
+ " -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n"
+ " -f, --frame-size=n Set the frame size (must be a power of two in aligned mode, default is %d).\n"
+ " -u, --unaligned Enable unaligned chunk placement\n"
"\n";
fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE);
exit(EXIT_FAILURE);
@@ -384,8 +398,8 @@ static void parse_command_line(int argc, char **argv)
opterr = 0;
for (;;) {
- c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:", long_options,
- &option_index);
+ c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:mu",
+ long_options, &option_index);
if (c == -1)
break;
@@ -424,12 +438,21 @@ static void parse_command_line(int argc, char **argv)
case 'c':
opt_xdp_bind_flags |= XDP_COPY;
break;
+ case 'u':
+ opt_umem_flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG;
+ opt_unaligned_chunks = 1;
+ opt_mmap_flags = MAP_HUGETLB;
+ break;
case 'F':
opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
break;
case 'f':
opt_xsk_frame_size = atoi(optarg);
+ case 'm':
+ opt_need_wakeup = false;
+ opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP;
break;
+
default:
usage(basename(argv[0]));
}
@@ -442,7 +465,8 @@ static void parse_command_line(int argc, char **argv)
usage(basename(argv[0]));
}
- if (opt_xsk_frame_size & (opt_xsk_frame_size - 1)) {
+ if ((opt_xsk_frame_size & (opt_xsk_frame_size - 1)) &&
+ !opt_unaligned_chunks) {
fprintf(stderr, "--frame-size=%d is not a power of two\n",
opt_xsk_frame_size);
usage(basename(argv[0]));
@@ -459,8 +483,10 @@ static void kick_tx(struct xsk_socket_info *xsk)
exit_with_error(errno);
}
-static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
+static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
+ struct pollfd *fds)
{
+ struct xsk_umem_info *umem = xsk->umem;
u32 idx_cq = 0, idx_fq = 0;
unsigned int rcvd;
size_t ndescs;
@@ -468,27 +494,30 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
if (!xsk->outstanding_tx)
return;
- kick_tx(xsk);
+ if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
+ kick_tx(xsk);
+
ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE :
xsk->outstanding_tx;
/* re-add completed Tx buffers */
- rcvd = xsk_ring_cons__peek(&xsk->umem->cq, ndescs, &idx_cq);
+ rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq);
if (rcvd > 0) {
unsigned int i;
int ret;
- ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
+ ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
- ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd,
- &idx_fq);
+ if (xsk_ring_prod__needs_wakeup(&umem->fq))
+ ret = poll(fds, num_socks, opt_timeout);
+ ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
}
+
for (i = 0; i < rcvd; i++)
- *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) =
- *xsk_ring_cons__comp_addr(&xsk->umem->cq,
- idx_cq++);
+ *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) =
+ *xsk_ring_cons__comp_addr(&umem->cq, idx_cq++);
xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
xsk_ring_cons__release(&xsk->umem->cq, rcvd);
@@ -505,7 +534,8 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk)
if (!xsk->outstanding_tx)
return;
- kick_tx(xsk);
+ if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
+ kick_tx(xsk);
rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx);
if (rcvd > 0) {
@@ -515,30 +545,38 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk)
}
}
-static void rx_drop(struct xsk_socket_info *xsk)
+static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
{
unsigned int rcvd, i;
u32 idx_rx = 0, idx_fq = 0;
int ret;
rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
- if (!rcvd)
+ if (!rcvd) {
+ if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+ ret = poll(fds, num_socks, opt_timeout);
return;
+ }
ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
+ if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+ ret = poll(fds, num_socks, opt_timeout);
ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
}
for (i = 0; i < rcvd; i++) {
u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
+ u64 orig = xsk_umem__extract_addr(addr);
+
+ addr = xsk_umem__add_offset_to_addr(addr);
char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
hex_dump(pkt, len, addr);
- *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = addr;
+ *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig;
}
xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
@@ -549,42 +587,65 @@ static void rx_drop(struct xsk_socket_info *xsk)
static void rx_drop_all(void)
{
struct pollfd fds[MAX_SOCKS + 1];
- int i, ret, timeout, nfds = 1;
+ int i, ret;
memset(fds, 0, sizeof(fds));
for (i = 0; i < num_socks; i++) {
fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
fds[i].events = POLLIN;
- timeout = 1000; /* 1sn */
}
for (;;) {
if (opt_poll) {
- ret = poll(fds, nfds, timeout);
+ ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;
}
for (i = 0; i < num_socks; i++)
- rx_drop(xsks[i]);
+ rx_drop(xsks[i], fds);
+ }
+}
+
+static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb)
+{
+ u32 idx;
+
+ if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == BATCH_SIZE) {
+ unsigned int i;
+
+ for (i = 0; i < BATCH_SIZE; i++) {
+ xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr =
+ (frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
+ xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
+ sizeof(pkt_data) - 1;
+ }
+
+ xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
+ xsk->outstanding_tx += BATCH_SIZE;
+ frame_nb += BATCH_SIZE;
+ frame_nb %= NUM_FRAMES;
}
+
+ complete_tx_only(xsk);
}
-static void tx_only(struct xsk_socket_info *xsk)
+static void tx_only_all(void)
{
- int timeout, ret, nfds = 1;
- struct pollfd fds[nfds + 1];
- u32 idx, frame_nb = 0;
+ struct pollfd fds[MAX_SOCKS];
+ u32 frame_nb[MAX_SOCKS] = {};
+ int i, ret;
memset(fds, 0, sizeof(fds));
- fds[0].fd = xsk_socket__fd(xsk->xsk);
- fds[0].events = POLLOUT;
- timeout = 1000; /* 1sn */
+ for (i = 0; i < num_socks; i++) {
+ fds[0].fd = xsk_socket__fd(xsks[i]->xsk);
+ fds[0].events = POLLOUT;
+ }
for (;;) {
if (opt_poll) {
- ret = poll(fds, nfds, timeout);
+ ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;
@@ -592,69 +653,78 @@ static void tx_only(struct xsk_socket_info *xsk)
continue;
}
- if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) ==
- BATCH_SIZE) {
- unsigned int i;
-
- for (i = 0; i < BATCH_SIZE; i++) {
- xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr
- = (frame_nb + i) * opt_xsk_frame_size;
- xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
- sizeof(pkt_data) - 1;
- }
-
- xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
- xsk->outstanding_tx += BATCH_SIZE;
- frame_nb += BATCH_SIZE;
- frame_nb %= NUM_FRAMES;
- }
-
- complete_tx_only(xsk);
+ for (i = 0; i < num_socks; i++)
+ tx_only(xsks[i], frame_nb[i]);
}
}
-static void l2fwd(struct xsk_socket_info *xsk)
+static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
{
- for (;;) {
- unsigned int rcvd, i;
- u32 idx_rx = 0, idx_tx = 0;
- int ret;
+ unsigned int rcvd, i;
+ u32 idx_rx = 0, idx_tx = 0;
+ int ret;
- for (;;) {
- complete_tx_l2fwd(xsk);
+ complete_tx_l2fwd(xsk, fds);
- rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE,
- &idx_rx);
- if (rcvd > 0)
- break;
- }
+ rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
+ if (!rcvd) {
+ if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+ ret = poll(fds, num_socks, opt_timeout);
+ return;
+ }
+ ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
+ while (ret != rcvd) {
+ if (ret < 0)
+ exit_with_error(-ret);
+ if (xsk_ring_prod__needs_wakeup(&xsk->tx))
+ kick_tx(xsk);
ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
- while (ret != rcvd) {
- if (ret < 0)
- exit_with_error(-ret);
- ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
- }
+ }
- for (i = 0; i < rcvd; i++) {
- u64 addr = xsk_ring_cons__rx_desc(&xsk->rx,
- idx_rx)->addr;
- u32 len = xsk_ring_cons__rx_desc(&xsk->rx,
- idx_rx++)->len;
- char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
+ for (i = 0; i < rcvd; i++) {
+ u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
+ u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
+ u64 orig = xsk_umem__extract_addr(addr);
- swap_mac_addresses(pkt);
+ addr = xsk_umem__add_offset_to_addr(addr);
+ char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
- hex_dump(pkt, len, addr);
- xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr;
- xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
- }
+ swap_mac_addresses(pkt);
+
+ hex_dump(pkt, len, addr);
+ xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = orig;
+ xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
+ }
+
+ xsk_ring_prod__submit(&xsk->tx, rcvd);
+ xsk_ring_cons__release(&xsk->rx, rcvd);
+
+ xsk->rx_npkts += rcvd;
+ xsk->outstanding_tx += rcvd;
+}
- xsk_ring_prod__submit(&xsk->tx, rcvd);
- xsk_ring_cons__release(&xsk->rx, rcvd);
+static void l2fwd_all(void)
+{
+ struct pollfd fds[MAX_SOCKS];
+ int i, ret;
+
+ memset(fds, 0, sizeof(fds));
- xsk->rx_npkts += rcvd;
- xsk->outstanding_tx += rcvd;
+ for (i = 0; i < num_socks; i++) {
+ fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
+ fds[i].events = POLLOUT | POLLIN;
+ }
+
+ for (;;) {
+ if (opt_poll) {
+ ret = poll(fds, num_socks, opt_timeout);
+ if (ret <= 0)
+ continue;
+ }
+
+ for (i = 0; i < num_socks; i++)
+ l2fwd(xsks[i], fds);
}
}
@@ -674,11 +744,14 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}
- ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
- NUM_FRAMES * opt_xsk_frame_size);
- if (ret)
- exit_with_error(ret);
-
+ /* Reserve memory for the umem. Use hugepages if unaligned chunk mode */
+ bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | opt_mmap_flags, -1, 0);
+ if (bufs == MAP_FAILED) {
+ printf("ERROR: mmap failed\n");
+ exit(EXIT_FAILURE);
+ }
/* Create sockets... */
umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size);
xsks[num_socks++] = xsk_configure_socket(umem);
@@ -705,9 +778,9 @@ int main(int argc, char **argv)
if (opt_bench == BENCH_RXDROP)
rx_drop_all();
else if (opt_bench == BENCH_TXONLY)
- tx_only(xsks[0]);
+ tx_only_all();
else
- l2fwd(xsks[0]);
+ l2fwd_all();
return 0;
}
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index c31193340108..0d8f41db8cd6 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -115,10 +115,12 @@ gen_btf()
LLVM_OBJCOPY=${OBJCOPY} ${PAHOLE} -J ${1}
# dump .BTF section into raw binary file to link with final vmlinux
- bin_arch=$(${OBJDUMP} -f ${1} | grep architecture | \
+ bin_arch=$(LANG=C ${OBJDUMP} -f ${1} | grep architecture | \
cut -d, -f1 | cut -d' ' -f2)
+ bin_format=$(LANG=C ${OBJDUMP} -f ${1} | grep 'file format' | \
+ awk '{print $4}')
${OBJCOPY} --dump-section .BTF=.btf.vmlinux.bin ${1} 2>/dev/null
- ${OBJCOPY} -I binary -O ${CONFIG_OUTPUT_FORMAT} -B ${bin_arch} \
+ ${OBJCOPY} -I binary -O ${bin_format} -B ${bin_arch} \
--rename-section .data=.BTF .btf.vmlinux.bin ${2}
}
diff --git a/tools/bpf/.gitignore b/tools/bpf/.gitignore
index dfe2bd5a4b95..59024197e71d 100644
--- a/tools/bpf/.gitignore
+++ b/tools/bpf/.gitignore
@@ -1,4 +1,5 @@
FEATURE-DUMP.bpf
+feature
bpf_asm
bpf_dbg
bpf_exp.yacc.*
diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile
index 53b60ad452f5..fbf5e4a0cb9c 100644
--- a/tools/bpf/Makefile
+++ b/tools/bpf/Makefile
@@ -81,10 +81,11 @@ $(OUTPUT)bpf_exp.lex.o: $(OUTPUT)bpf_exp.lex.c
clean: bpftool_clean
$(call QUIET_CLEAN, bpf-progs)
- $(Q)rm -rf $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \
+ $(Q)$(RM) -r -- $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \
$(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.*
$(call QUIET_CLEAN, core-gen)
- $(Q)rm -f $(OUTPUT)FEATURE-DUMP.bpf
+ $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpf
+ $(Q)$(RM) -r -- $(OUTPUT)feature
install: $(PROGS) bpftool_install
$(call QUIET_INSTALL, bpf_jit_disasm)
diff --git a/tools/bpf/bpftool/.gitignore b/tools/bpf/bpftool/.gitignore
index 8248b8dd89d4..b13926432b84 100644
--- a/tools/bpf/bpftool/.gitignore
+++ b/tools/bpf/bpftool/.gitignore
@@ -3,3 +3,5 @@
bpftool*.8
bpf-helpers.*
FEATURE-DUMP.bpftool
+feature
+libbpf
diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
index 6694a0fc8f99..39615f8e145b 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
@@ -19,6 +19,7 @@ SYNOPSIS
BTF COMMANDS
=============
+| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*]
| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*]
| **bpftool** **btf help**
|
@@ -29,6 +30,12 @@ BTF COMMANDS
DESCRIPTION
===========
+ **bpftool btf { show | list }** [**id** *BTF_ID*]
+ Show information about loaded BTF objects. If a BTF ID is
+ specified, show information only about given BTF object,
+ otherwise list all BTF objects currently loaded on the
+ system.
+
**bpftool btf dump** *BTF_SRC*
Dump BTF entries from a given *BTF_SRC*.
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 61d1d270eb5e..1c0f7146aab0 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -36,6 +36,7 @@ MAP COMMANDS
| **bpftool** **map pop** *MAP*
| **bpftool** **map enqueue** *MAP* **value** *VALUE*
| **bpftool** **map dequeue** *MAP*
+| **bpftool** **map freeze** *MAP*
| **bpftool** **map help**
|
| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
@@ -127,6 +128,14 @@ DESCRIPTION
**bpftool map dequeue** *MAP*
Dequeue and print **value** from the queue.
+ **bpftool map freeze** *MAP*
+ Freeze the map as read-only from user space. Entries from a
+ frozen map can not longer be updated or deleted with the
+ **bpf\ ()** system call. This operation is not reversible,
+ and the map remains immutable from user space until its
+ destruction. However, read and write permissions for BPF
+ programs to the map remain unchanged.
+
**bpftool map help**
Print short help message.
diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
index d8e5237a2085..8651b00b81ea 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -15,17 +15,22 @@ SYNOPSIS
*OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
*COMMANDS* :=
- { **show** | **list** } [ **dev** name ] | **help**
+ { **show** | **list** | **attach** | **detach** | **help** }
NET COMMANDS
============
-| **bpftool** **net { show | list } [ dev name ]**
+| **bpftool** **net { show | list }** [ **dev** *NAME* ]
+| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
+| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
| **bpftool** **net help**
+|
+| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
+| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** }
DESCRIPTION
===========
- **bpftool net { show | list } [ dev name ]**
+ **bpftool net { show | list }** [ **dev** *NAME* ]
List bpf program attachments in the kernel networking subsystem.
Currently, only device driver xdp attachments and tc filter
@@ -47,6 +52,24 @@ DESCRIPTION
all bpf programs attached to non clsact qdiscs, and finally all
bpf programs attached to root and clsact qdisc.
+ **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
+ Attach bpf program *PROG* to network interface *NAME* with
+ type specified by *ATTACH_TYPE*. Previously attached bpf program
+ can be replaced by the command used with **overwrite** option.
+ Currently, only XDP-related modes are supported for *ATTACH_TYPE*.
+
+ *ATTACH_TYPE* can be of:
+ **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it;
+ **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb;
+ **xdpdrv** - Native XDP. runs earliest point in driver's receive path;
+ **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception;
+
+ **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
+ Detach bpf program attached to network interface *NAME* with
+ type specified by *ATTACH_TYPE*. To detach bpf program, same
+ *ATTACH_TYPE* previously used for attach must be specified.
+ Currently, only XDP-related modes are supported for *ATTACH_TYPE*.
+
**bpftool net help**
Print short help message.
@@ -137,6 +160,34 @@ EXAMPLES
}
]
+|
+| **# bpftool net attach xdpdrv id 16 dev enp6s0np0**
+| **# bpftool net**
+
+::
+
+ xdp:
+ enp6s0np0(4) driver id 16
+
+|
+| **# bpftool net attach xdpdrv id 16 dev enp6s0np0**
+| **# bpftool net attach xdpdrv id 20 dev enp6s0np0 overwrite**
+| **# bpftool net**
+
+::
+
+ xdp:
+ enp6s0np0(4) driver id 20
+
+|
+| **# bpftool net attach xdpdrv id 16 dev enp6s0np0**
+| **# bpftool net detach xdpdrv dev enp6s0np0**
+| **# bpftool net**
+
+::
+
+ xdp:
+
SEE ALSO
========
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 4c9d1ffc3fc7..39bc6f0f4f0b 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -17,27 +17,30 @@ endif
BPF_DIR = $(srctree)/tools/lib/bpf/
ifneq ($(OUTPUT),)
- BPF_PATH = $(OUTPUT)
+ LIBBPF_OUTPUT = $(OUTPUT)/libbpf/
+ LIBBPF_PATH = $(LIBBPF_OUTPUT)
else
- BPF_PATH = $(BPF_DIR)
+ LIBBPF_PATH = $(BPF_DIR)
endif
-LIBBPF = $(BPF_PATH)libbpf.a
+LIBBPF = $(LIBBPF_PATH)libbpf.a
-BPFTOOL_VERSION := $(shell make --no-print-directory -sC ../../.. kernelversion)
+BPFTOOL_VERSION := $(shell make -rR --no-print-directory -sC ../../.. kernelversion)
$(LIBBPF): FORCE
- $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(OUTPUT) $(OUTPUT)libbpf.a
+ $(if $(LIBBPF_OUTPUT),@mkdir -p $(LIBBPF_OUTPUT))
+ $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) $(LIBBPF_OUTPUT)libbpf.a
$(LIBBPF)-clean:
$(call QUIET_CLEAN, libbpf)
- $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(OUTPUT) clean >/dev/null
+ $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) clean >/dev/null
prefix ?= /usr/local
bash_compdir ?= /usr/share/bash-completion/completions
CFLAGS += -O2
-CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-initializers
+CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers
+CFLAGS += $(filter-out -Wswitch-enum,$(EXTRA_WARNINGS))
CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \
-I$(srctree)/kernel/bpf/ \
-I$(srctree)/tools/include \
@@ -52,7 +55,7 @@ ifneq ($(EXTRA_LDFLAGS),)
LDFLAGS += $(EXTRA_LDFLAGS)
endif
-LIBS = -lelf -lz $(LIBBPF)
+LIBS = $(LIBBPF) -lelf -lz
INSTALL ?= install
RM ?= rm -f
@@ -114,16 +117,18 @@ $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
$(OUTPUT)feature.o: | zdep
$(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
- $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS)
+ $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJS) $(LIBS)
$(OUTPUT)%.o: %.c
$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
clean: $(LIBBPF)-clean
$(call QUIET_CLEAN, bpftool)
- $(Q)$(RM) $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
+ $(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
+ $(Q)$(RM) -r -- $(OUTPUT)libbpf/
$(call QUIET_CLEAN, core-gen)
- $(Q)$(RM) $(OUTPUT)FEATURE-DUMP.bpftool
+ $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpftool
+ $(Q)$(RM) -r -- $(OUTPUT)feature/
install: $(OUTPUT)bpftool
$(call QUIET_INSTALL, bpftool)
@@ -134,8 +139,8 @@ install: $(OUTPUT)bpftool
uninstall:
$(call QUIET_UNINST, bpftool)
- $(Q)$(RM) $(DESTDIR)$(prefix)/sbin/bpftool
- $(Q)$(RM) $(DESTDIR)$(bash_compdir)/bpftool
+ $(Q)$(RM) -- $(DESTDIR)$(prefix)/sbin/bpftool
+ $(Q)$(RM) -- $(DESTDIR)$(bash_compdir)/bpftool
doc:
$(call descend,Documentation)
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index df16c5415444..70493a6da206 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -73,8 +73,8 @@ _bpftool_get_prog_tags()
_bpftool_get_btf_ids()
{
- COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \
- command sed -n 's/.*"btf_id": \(.*\),\?$/\1/p' )" -- "$cur" ) )
+ COMPREPLY+=( $( compgen -W "$( bpftool -jp btf 2>&1 | \
+ command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
}
_bpftool_get_obj_map_names()
@@ -201,6 +201,10 @@ _bpftool()
_bpftool_get_prog_tags
return 0
;;
+ dev)
+ _sysfs_get_netdevs
+ return 0
+ ;;
file|pinned)
_filedir
return 0
@@ -399,10 +403,6 @@ _bpftool()
_filedir
return 0
;;
- dev)
- _sysfs_get_netdevs
- return 0
- ;;
*)
COMPREPLY=( $( compgen -W "map" -- "$cur" ) )
_bpftool_once_attr 'type'
@@ -449,7 +449,7 @@ _bpftool()
map)
local MAP_TYPE='id pinned'
case $command in
- show|list|dump|peek|pop|dequeue)
+ show|list|dump|peek|pop|dequeue|freeze)
case $prev in
$command)
COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) )
@@ -498,10 +498,6 @@ _bpftool()
key|value|flags|name|entries)
return 0
;;
- dev)
- _sysfs_get_netdevs
- return 0
- ;;
*)
_bpftool_once_attr 'type'
_bpftool_once_attr 'key'
@@ -642,7 +638,7 @@ _bpftool()
[[ $prev == $object ]] && \
COMPREPLY=( $( compgen -W 'delete dump getnext help \
lookup pin event_pipe show list update create \
- peek push enqueue pop dequeue' -- \
+ peek push enqueue pop dequeue freeze' -- \
"$cur" ) )
;;
esac
@@ -674,7 +670,7 @@ _bpftool()
map)
_bpftool_get_map_ids
;;
- dump)
+ $command)
_bpftool_get_btf_ids
;;
esac
@@ -702,9 +698,21 @@ _bpftool()
;;
esac
;;
+ show|list)
+ case $prev in
+ $command)
+ COMPREPLY+=( $( compgen -W "id" -- "$cur" ) )
+ ;;
+ id)
+ _bpftool_get_btf_ids
+ ;;
+ esac
+ return 0
+ ;;
*)
[[ $prev == $object ]] && \
- COMPREPLY=( $( compgen -W 'dump help' -- "$cur" ) )
+ COMPREPLY=( $( compgen -W 'dump help show list' \
+ -- "$cur" ) )
;;
esac
;;
@@ -778,18 +786,67 @@ _bpftool()
esac
;;
net)
+ local PROG_TYPE='id pinned tag'
+ local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload'
case $command in
+ show|list)
+ [[ $prev != "$command" ]] && return 0
+ COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) )
+ return 0
+ ;;
+ attach)
+ case $cword in
+ 3)
+ COMPREPLY=( $( compgen -W "$ATTACH_TYPES" -- "$cur" ) )
+ return 0
+ ;;
+ 4)
+ COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) )
+ return 0
+ ;;
+ 5)
+ case $prev in
+ id)
+ _bpftool_get_prog_ids
+ ;;
+ pinned)
+ _filedir
+ ;;
+ esac
+ return 0
+ ;;
+ 6)
+ COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) )
+ return 0
+ ;;
+ 8)
+ _bpftool_once_attr 'overwrite'
+ return 0
+ ;;
+ esac
+ ;;
+ detach)
+ case $cword in
+ 3)
+ COMPREPLY=( $( compgen -W "$ATTACH_TYPES" -- "$cur" ) )
+ return 0
+ ;;
+ 4)
+ COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) )
+ return 0
+ ;;
+ esac
+ ;;
*)
[[ $prev == $object ]] && \
COMPREPLY=( $( compgen -W 'help \
- show list' -- "$cur" ) )
+ show list attach detach' -- "$cur" ) )
;;
esac
;;
feature)
case $command in
probe)
- [[ $prev == "dev" ]] && _sysfs_get_netdevs && return 0
[[ $prev == "prefix" ]] && return 0
if _bpftool_search_list 'macros'; then
COMPREPLY+=( $( compgen -W 'prefix' -- "$cur" ) )
diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index 1b8ec91899e6..9a9376d1d3df 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -11,6 +11,7 @@
#include <bpf.h>
#include <libbpf.h>
#include <linux/btf.h>
+#include <linux/hashtable.h>
#include "btf.h"
#include "json_writer.h"
@@ -35,6 +36,16 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_DATASEC] = "DATASEC",
};
+struct btf_attach_table {
+ DECLARE_HASHTABLE(table, 16);
+};
+
+struct btf_attach_point {
+ __u32 obj_id;
+ __u32 btf_id;
+ struct hlist_node hash;
+};
+
static const char *btf_int_enc_str(__u8 encoding)
{
switch (encoding) {
@@ -449,7 +460,7 @@ static int do_dump(int argc, char **argv)
btf_id = strtoul(*argv, &endptr, 0);
if (*endptr) {
- p_err("can't parse %s as ID", **argv);
+ p_err("can't parse %s as ID", *argv);
return -1;
}
NEXT_ARG();
@@ -522,6 +533,330 @@ done:
return err;
}
+static int btf_parse_fd(int *argc, char ***argv)
+{
+ unsigned int id;
+ char *endptr;
+ int fd;
+
+ if (!is_prefix(*argv[0], "id")) {
+ p_err("expected 'id', got: '%s'?", **argv);
+ return -1;
+ }
+ NEXT_ARGP();
+
+ id = strtoul(**argv, &endptr, 0);
+ if (*endptr) {
+ p_err("can't parse %s as ID", **argv);
+ return -1;
+ }
+ NEXT_ARGP();
+
+ fd = bpf_btf_get_fd_by_id(id);
+ if (fd < 0)
+ p_err("can't get BTF object by id (%u): %s",
+ id, strerror(errno));
+
+ return fd;
+}
+
+static void delete_btf_table(struct btf_attach_table *tab)
+{
+ struct btf_attach_point *obj;
+ struct hlist_node *tmp;
+
+ unsigned int bkt;
+
+ hash_for_each_safe(tab->table, bkt, tmp, obj, hash) {
+ hash_del(&obj->hash);
+ free(obj);
+ }
+}
+
+static int
+build_btf_type_table(struct btf_attach_table *tab, enum bpf_obj_type type,
+ void *info, __u32 *len)
+{
+ static const char * const names[] = {
+ [BPF_OBJ_UNKNOWN] = "unknown",
+ [BPF_OBJ_PROG] = "prog",
+ [BPF_OBJ_MAP] = "map",
+ };
+ struct btf_attach_point *obj_node;
+ __u32 btf_id, id = 0;
+ int err;
+ int fd;
+
+ while (true) {
+ switch (type) {
+ case BPF_OBJ_PROG:
+ err = bpf_prog_get_next_id(id, &id);
+ break;
+ case BPF_OBJ_MAP:
+ err = bpf_map_get_next_id(id, &id);
+ break;
+ default:
+ err = -1;
+ p_err("unexpected object type: %d", type);
+ goto err_free;
+ }
+ if (err) {
+ if (errno == ENOENT) {
+ err = 0;
+ break;
+ }
+ p_err("can't get next %s: %s%s", names[type],
+ strerror(errno),
+ errno == EINVAL ? " -- kernel too old?" : "");
+ goto err_free;
+ }
+
+ switch (type) {
+ case BPF_OBJ_PROG:
+ fd = bpf_prog_get_fd_by_id(id);
+ break;
+ case BPF_OBJ_MAP:
+ fd = bpf_map_get_fd_by_id(id);
+ break;
+ default:
+ err = -1;
+ p_err("unexpected object type: %d", type);
+ goto err_free;
+ }
+ if (fd < 0) {
+ if (errno == ENOENT)
+ continue;
+ p_err("can't get %s by id (%u): %s", names[type], id,
+ strerror(errno));
+ err = -1;
+ goto err_free;
+ }
+
+ memset(info, 0, *len);
+ err = bpf_obj_get_info_by_fd(fd, info, len);
+ close(fd);
+ if (err) {
+ p_err("can't get %s info: %s", names[type],
+ strerror(errno));
+ goto err_free;
+ }
+
+ switch (type) {
+ case BPF_OBJ_PROG:
+ btf_id = ((struct bpf_prog_info *)info)->btf_id;
+ break;
+ case BPF_OBJ_MAP:
+ btf_id = ((struct bpf_map_info *)info)->btf_id;
+ break;
+ default:
+ err = -1;
+ p_err("unexpected object type: %d", type);
+ goto err_free;
+ }
+ if (!btf_id)
+ continue;
+
+ obj_node = calloc(1, sizeof(*obj_node));
+ if (!obj_node) {
+ p_err("failed to allocate memory: %s", strerror(errno));
+ goto err_free;
+ }
+
+ obj_node->obj_id = id;
+ obj_node->btf_id = btf_id;
+ hash_add(tab->table, &obj_node->hash, obj_node->btf_id);
+ }
+
+ return 0;
+
+err_free:
+ delete_btf_table(tab);
+ return err;
+}
+
+static int
+build_btf_tables(struct btf_attach_table *btf_prog_table,
+ struct btf_attach_table *btf_map_table)
+{
+ struct bpf_prog_info prog_info;
+ __u32 prog_len = sizeof(prog_info);
+ struct bpf_map_info map_info;
+ __u32 map_len = sizeof(map_info);
+ int err = 0;
+
+ err = build_btf_type_table(btf_prog_table, BPF_OBJ_PROG, &prog_info,
+ &prog_len);
+ if (err)
+ return err;
+
+ err = build_btf_type_table(btf_map_table, BPF_OBJ_MAP, &map_info,
+ &map_len);
+ if (err) {
+ delete_btf_table(btf_prog_table);
+ return err;
+ }
+
+ return 0;
+}
+
+static void
+show_btf_plain(struct bpf_btf_info *info, int fd,
+ struct btf_attach_table *btf_prog_table,
+ struct btf_attach_table *btf_map_table)
+{
+ struct btf_attach_point *obj;
+ int n;
+
+ printf("%u: ", info->id);
+ printf("size %uB", info->btf_size);
+
+ n = 0;
+ hash_for_each_possible(btf_prog_table->table, obj, hash, info->id) {
+ if (obj->btf_id == info->id)
+ printf("%s%u", n++ == 0 ? " prog_ids " : ",",
+ obj->obj_id);
+ }
+
+ n = 0;
+ hash_for_each_possible(btf_map_table->table, obj, hash, info->id) {
+ if (obj->btf_id == info->id)
+ printf("%s%u", n++ == 0 ? " map_ids " : ",",
+ obj->obj_id);
+ }
+
+ printf("\n");
+}
+
+static void
+show_btf_json(struct bpf_btf_info *info, int fd,
+ struct btf_attach_table *btf_prog_table,
+ struct btf_attach_table *btf_map_table)
+{
+ struct btf_attach_point *obj;
+
+ jsonw_start_object(json_wtr); /* btf object */
+ jsonw_uint_field(json_wtr, "id", info->id);
+ jsonw_uint_field(json_wtr, "size", info->btf_size);
+
+ jsonw_name(json_wtr, "prog_ids");
+ jsonw_start_array(json_wtr); /* prog_ids */
+ hash_for_each_possible(btf_prog_table->table, obj, hash,
+ info->id) {
+ if (obj->btf_id == info->id)
+ jsonw_uint(json_wtr, obj->obj_id);
+ }
+ jsonw_end_array(json_wtr); /* prog_ids */
+
+ jsonw_name(json_wtr, "map_ids");
+ jsonw_start_array(json_wtr); /* map_ids */
+ hash_for_each_possible(btf_map_table->table, obj, hash,
+ info->id) {
+ if (obj->btf_id == info->id)
+ jsonw_uint(json_wtr, obj->obj_id);
+ }
+ jsonw_end_array(json_wtr); /* map_ids */
+ jsonw_end_object(json_wtr); /* btf object */
+}
+
+static int
+show_btf(int fd, struct btf_attach_table *btf_prog_table,
+ struct btf_attach_table *btf_map_table)
+{
+ struct bpf_btf_info info = {};
+ __u32 len = sizeof(info);
+ int err;
+
+ err = bpf_obj_get_info_by_fd(fd, &info, &len);
+ if (err) {
+ p_err("can't get BTF object info: %s", strerror(errno));
+ return -1;
+ }
+
+ if (json_output)
+ show_btf_json(&info, fd, btf_prog_table, btf_map_table);
+ else
+ show_btf_plain(&info, fd, btf_prog_table, btf_map_table);
+
+ return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+ struct btf_attach_table btf_prog_table;
+ struct btf_attach_table btf_map_table;
+ int err, fd = -1;
+ __u32 id = 0;
+
+ if (argc == 2) {
+ fd = btf_parse_fd(&argc, &argv);
+ if (fd < 0)
+ return -1;
+ }
+
+ if (argc) {
+ if (fd >= 0)
+ close(fd);
+ return BAD_ARG();
+ }
+
+ hash_init(btf_prog_table.table);
+ hash_init(btf_map_table.table);
+ err = build_btf_tables(&btf_prog_table, &btf_map_table);
+ if (err) {
+ if (fd >= 0)
+ close(fd);
+ return err;
+ }
+
+ if (fd >= 0) {
+ err = show_btf(fd, &btf_prog_table, &btf_map_table);
+ close(fd);
+ goto exit_free;
+ }
+
+ if (json_output)
+ jsonw_start_array(json_wtr); /* root array */
+
+ while (true) {
+ err = bpf_btf_get_next_id(id, &id);
+ if (err) {
+ if (errno == ENOENT) {
+ err = 0;
+ break;
+ }
+ p_err("can't get next BTF object: %s%s",
+ strerror(errno),
+ errno == EINVAL ? " -- kernel too old?" : "");
+ err = -1;
+ break;
+ }
+
+ fd = bpf_btf_get_fd_by_id(id);
+ if (fd < 0) {
+ if (errno == ENOENT)
+ continue;
+ p_err("can't get BTF object by id (%u): %s",
+ id, strerror(errno));
+ err = -1;
+ break;
+ }
+
+ err = show_btf(fd, &btf_prog_table, &btf_map_table);
+ close(fd);
+ if (err)
+ break;
+ }
+
+ if (json_output)
+ jsonw_end_array(json_wtr); /* root array */
+
+exit_free:
+ delete_btf_table(&btf_prog_table);
+ delete_btf_table(&btf_map_table);
+
+ return err;
+}
+
static int do_help(int argc, char **argv)
{
if (json_output) {
@@ -530,7 +865,8 @@ static int do_help(int argc, char **argv)
}
fprintf(stderr,
- "Usage: %s btf dump BTF_SRC [format FORMAT]\n"
+ "Usage: %s btf { show | list } [id BTF_ID]\n"
+ " %s btf dump BTF_SRC [format FORMAT]\n"
" %s btf help\n"
"\n"
" BTF_SRC := { id BTF_ID | prog PROG | map MAP [{key | value | kv | all}] | file FILE }\n"
@@ -539,12 +875,14 @@ static int do_help(int argc, char **argv)
" " HELP_SPEC_PROGRAM "\n"
" " HELP_SPEC_OPTIONS "\n"
"",
- bin_name, bin_name);
+ bin_name, bin_name, bin_name);
return 0;
}
static const struct cmd cmds[] = {
+ { "show", do_show },
+ { "list", do_show },
{ "help", do_help },
{ "dump", do_dump },
{ 0 }
diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c
index 8cafb9b31467..d66131f69689 100644
--- a/tools/bpf/bpftool/btf_dumper.c
+++ b/tools/bpf/bpftool/btf_dumper.c
@@ -26,9 +26,9 @@ static void btf_dumper_ptr(const void *data, json_writer_t *jw,
bool is_plain_text)
{
if (is_plain_text)
- jsonw_printf(jw, "%p", *(unsigned long *)data);
+ jsonw_printf(jw, "%p", data);
else
- jsonw_printf(jw, "%u", *(unsigned long *)data);
+ jsonw_printf(jw, "%lu", *(unsigned long *)data);
}
static int btf_dumper_modifier(const struct btf_dumper *d, __u32 type_id,
@@ -216,7 +216,7 @@ static int btf_dumper_int(const struct btf_type *t, __u8 bit_offset,
switch (BTF_INT_ENCODING(*int_type)) {
case 0:
if (BTF_INT_BITS(*int_type) == 64)
- jsonw_printf(jw, "%lu", *(__u64 *)data);
+ jsonw_printf(jw, "%llu", *(__u64 *)data);
else if (BTF_INT_BITS(*int_type) == 32)
jsonw_printf(jw, "%u", *(__u32 *)data);
else if (BTF_INT_BITS(*int_type) == 16)
@@ -229,7 +229,7 @@ static int btf_dumper_int(const struct btf_type *t, __u8 bit_offset,
break;
case BTF_INT_SIGNED:
if (BTF_INT_BITS(*int_type) == 64)
- jsonw_printf(jw, "%ld", *(long long *)data);
+ jsonw_printf(jw, "%lld", *(long long *)data);
else if (BTF_INT_BITS(*int_type) == 32)
jsonw_printf(jw, "%d", *(int *)data);
else if (BTF_INT_BITS(*int_type) == 16)
diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index 44352b5aca85..1ef45e55039e 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -120,8 +120,8 @@ static int count_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type)
static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type,
int level)
{
+ const char *attach_flags_str;
__u32 prog_ids[1024] = {0};
- char *attach_flags_str;
__u32 prog_cnt, iter;
__u32 attach_flags;
char buf[32];
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 6a71324be628..88264abaa738 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -29,7 +29,7 @@
#define BPF_FS_MAGIC 0xcafe4a11
#endif
-void __printf(1, 2) p_err(const char *fmt, ...)
+void p_err(const char *fmt, ...)
{
va_list ap;
@@ -47,7 +47,7 @@ void __printf(1, 2) p_err(const char *fmt, ...)
va_end(ap);
}
-void __printf(1, 2) p_info(const char *fmt, ...)
+void p_info(const char *fmt, ...)
{
va_list ap;
diff --git a/tools/bpf/bpftool/json_writer.c b/tools/bpf/bpftool/json_writer.c
index 6046dcab51cc..86501cd3c763 100644
--- a/tools/bpf/bpftool/json_writer.c
+++ b/tools/bpf/bpftool/json_writer.c
@@ -15,7 +15,6 @@
#include <malloc.h>
#include <inttypes.h>
#include <stdint.h>
-#include <linux/compiler.h>
#include "json_writer.h"
@@ -153,8 +152,7 @@ void jsonw_name(json_writer_t *self, const char *name)
putc(' ', self->out);
}
-void __printf(2, 0)
-jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap)
+void jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap)
{
jsonw_eor(self);
putc('"', self->out);
@@ -162,7 +160,7 @@ jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap)
putc('"', self->out);
}
-void __printf(2, 3) jsonw_printf(json_writer_t *self, const char *fmt, ...)
+void jsonw_printf(json_writer_t *self, const char *fmt, ...)
{
va_list ap;
diff --git a/tools/bpf/bpftool/json_writer.h b/tools/bpf/bpftool/json_writer.h
index cb9a1993681c..35cf1f00f96c 100644
--- a/tools/bpf/bpftool/json_writer.h
+++ b/tools/bpf/bpftool/json_writer.h
@@ -14,6 +14,7 @@
#include <stdbool.h>
#include <stdint.h>
#include <stdarg.h>
+#include <linux/compiler.h>
/* Opaque class structure */
typedef struct json_writer json_writer_t;
@@ -30,8 +31,9 @@ void jsonw_pretty(json_writer_t *self, bool on);
void jsonw_name(json_writer_t *self, const char *name);
/* Add value */
-void jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap);
-void jsonw_printf(json_writer_t *self, const char *fmt, ...);
+void __printf(2, 0) jsonw_vprintf_enquote(json_writer_t *self, const char *fmt,
+ va_list ap);
+void __printf(2, 3) jsonw_printf(json_writer_t *self, const char *fmt, ...);
void jsonw_string(json_writer_t *self, const char *value);
void jsonw_bool(json_writer_t *self, bool value);
void jsonw_float(json_writer_t *self, double number);
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index e916ff25697f..93d008687020 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -139,7 +139,7 @@ int detect_common_prefix(const char *arg, ...)
strncat(msg, "'", sizeof(msg) - strlen(msg) - 1);
if (count >= 2) {
- p_err(msg);
+ p_err("%s", msg);
return -1;
}
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 7031a4bf87a0..af9ad56c303a 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -98,8 +98,8 @@ extern int bpf_flags;
extern struct pinned_obj_table prog_table;
extern struct pinned_obj_table map_table;
-void p_err(const char *fmt, ...);
-void p_info(const char *fmt, ...);
+void __printf(1, 2) p_err(const char *fmt, ...);
+void __printf(1, 2) p_info(const char *fmt, ...);
bool is_prefix(const char *pfx, const char *str);
int detect_common_prefix(const char *arg, ...);
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index bfbbc6b4cb83..de61d73b9030 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -481,9 +481,11 @@ static int parse_elem(char **argv, struct bpf_map_info *info,
static int show_map_close_json(int fd, struct bpf_map_info *info)
{
- char *memlock;
+ char *memlock, *frozen_str;
+ int frozen = 0;
memlock = get_fdinfo(fd, "memlock");
+ frozen_str = get_fdinfo(fd, "frozen");
jsonw_start_object(json_wtr);
@@ -533,6 +535,12 @@ static int show_map_close_json(int fd, struct bpf_map_info *info)
}
close(fd);
+ if (frozen_str) {
+ frozen = atoi(frozen_str);
+ free(frozen_str);
+ }
+ jsonw_int_field(json_wtr, "frozen", frozen);
+
if (info->btf_id)
jsonw_int_field(json_wtr, "btf_id", info->btf_id);
@@ -555,9 +563,11 @@ static int show_map_close_json(int fd, struct bpf_map_info *info)
static int show_map_close_plain(int fd, struct bpf_map_info *info)
{
- char *memlock;
+ char *memlock, *frozen_str;
+ int frozen = 0;
memlock = get_fdinfo(fd, "memlock");
+ frozen_str = get_fdinfo(fd, "frozen");
printf("%u: ", info->id);
if (info->type < ARRAY_SIZE(map_type_name))
@@ -610,9 +620,23 @@ static int show_map_close_plain(int fd, struct bpf_map_info *info)
printf("\n\tpinned %s", obj->path);
}
}
+ printf("\n");
+
+ if (frozen_str) {
+ frozen = atoi(frozen_str);
+ free(frozen_str);
+ }
+
+ if (!info->btf_id && !frozen)
+ return 0;
+
+ printf("\t");
if (info->btf_id)
- printf("\n\tbtf_id %d", info->btf_id);
+ printf("btf_id %d", info->btf_id);
+
+ if (frozen)
+ printf("%sfrozen", info->btf_id ? " " : "");
printf("\n");
return 0;
@@ -1238,6 +1262,35 @@ exit_free:
return err;
}
+static int do_freeze(int argc, char **argv)
+{
+ int err, fd;
+
+ if (!REQ_ARGS(2))
+ return -1;
+
+ fd = map_parse_fd(&argc, &argv);
+ if (fd < 0)
+ return -1;
+
+ if (argc) {
+ close(fd);
+ return BAD_ARG();
+ }
+
+ err = bpf_map_freeze(fd);
+ close(fd);
+ if (err) {
+ p_err("failed to freeze map: %s", strerror(errno));
+ return err;
+ }
+
+ if (json_output)
+ jsonw_null(json_wtr);
+
+ return 0;
+}
+
static int do_help(int argc, char **argv)
{
if (json_output) {
@@ -1262,6 +1315,7 @@ static int do_help(int argc, char **argv)
" %s %s pop MAP\n"
" %s %s enqueue MAP value VALUE\n"
" %s %s dequeue MAP\n"
+ " %s %s freeze MAP\n"
" %s %s help\n"
"\n"
" " HELP_SPEC_MAP "\n"
@@ -1280,7 +1334,8 @@ static int do_help(int argc, char **argv)
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
- bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
+ bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
+ bin_name, argv[-2]);
return 0;
}
@@ -1302,6 +1357,7 @@ static const struct cmd cmds[] = {
{ "enqueue", do_update },
{ "pop", do_pop_dequeue },
{ "dequeue", do_pop_dequeue },
+ { "freeze", do_freeze },
{ 0 }
};
diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c
index 3f108ab17797..4c5531d1a450 100644
--- a/tools/bpf/bpftool/map_perf_ring.c
+++ b/tools/bpf/bpftool/map_perf_ring.c
@@ -157,7 +157,7 @@ int do_event_pipe(int argc, char **argv)
NEXT_ARG();
ctx.cpu = strtoul(*argv, &endptr, 0);
if (*endptr) {
- p_err("can't parse %s as CPU ID", **argv);
+ p_err("can't parse %s as CPU ID", *argv);
goto err_close_map;
}
@@ -168,7 +168,7 @@ int do_event_pipe(int argc, char **argv)
NEXT_ARG();
ctx.idx = strtoul(*argv, &endptr, 0);
if (*endptr) {
- p_err("can't parse %s as index", **argv);
+ p_err("can't parse %s as index", *argv);
goto err_close_map;
}
diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c
index 67e99c56bc88..4f52d3151616 100644
--- a/tools/bpf/bpftool/net.c
+++ b/tools/bpf/bpftool/net.c
@@ -55,6 +55,35 @@ struct bpf_attach_info {
__u32 flow_dissector_id;
};
+enum net_attach_type {
+ NET_ATTACH_TYPE_XDP,
+ NET_ATTACH_TYPE_XDP_GENERIC,
+ NET_ATTACH_TYPE_XDP_DRIVER,
+ NET_ATTACH_TYPE_XDP_OFFLOAD,
+};
+
+static const char * const attach_type_strings[] = {
+ [NET_ATTACH_TYPE_XDP] = "xdp",
+ [NET_ATTACH_TYPE_XDP_GENERIC] = "xdpgeneric",
+ [NET_ATTACH_TYPE_XDP_DRIVER] = "xdpdrv",
+ [NET_ATTACH_TYPE_XDP_OFFLOAD] = "xdpoffload",
+};
+
+const size_t net_attach_type_size = ARRAY_SIZE(attach_type_strings);
+
+static enum net_attach_type parse_attach_type(const char *str)
+{
+ enum net_attach_type type;
+
+ for (type = 0; type < net_attach_type_size; type++) {
+ if (attach_type_strings[type] &&
+ is_prefix(str, attach_type_strings[type]))
+ return type;
+ }
+
+ return net_attach_type_size;
+}
+
static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb)
{
struct bpf_netdev_t *netinfo = cookie;
@@ -197,7 +226,7 @@ static int query_flow_dissector(struct bpf_attach_info *attach_info)
fd = open("/proc/self/ns/net", O_RDONLY);
if (fd < 0) {
- p_err("can't open /proc/self/ns/net: %d",
+ p_err("can't open /proc/self/ns/net: %s",
strerror(errno));
return -1;
}
@@ -223,6 +252,134 @@ static int query_flow_dissector(struct bpf_attach_info *attach_info)
return 0;
}
+static int net_parse_dev(int *argc, char ***argv)
+{
+ int ifindex;
+
+ if (is_prefix(**argv, "dev")) {
+ NEXT_ARGP();
+
+ ifindex = if_nametoindex(**argv);
+ if (!ifindex)
+ p_err("invalid devname %s", **argv);
+
+ NEXT_ARGP();
+ } else {
+ p_err("expected 'dev', got: '%s'?", **argv);
+ return -1;
+ }
+
+ return ifindex;
+}
+
+static int do_attach_detach_xdp(int progfd, enum net_attach_type attach_type,
+ int ifindex, bool overwrite)
+{
+ __u32 flags = 0;
+
+ if (!overwrite)
+ flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+ if (attach_type == NET_ATTACH_TYPE_XDP_GENERIC)
+ flags |= XDP_FLAGS_SKB_MODE;
+ if (attach_type == NET_ATTACH_TYPE_XDP_DRIVER)
+ flags |= XDP_FLAGS_DRV_MODE;
+ if (attach_type == NET_ATTACH_TYPE_XDP_OFFLOAD)
+ flags |= XDP_FLAGS_HW_MODE;
+
+ return bpf_set_link_xdp_fd(ifindex, progfd, flags);
+}
+
+static int do_attach(int argc, char **argv)
+{
+ enum net_attach_type attach_type;
+ int progfd, ifindex, err = 0;
+ bool overwrite = false;
+
+ /* parse attach args */
+ if (!REQ_ARGS(5))
+ return -EINVAL;
+
+ attach_type = parse_attach_type(*argv);
+ if (attach_type == net_attach_type_size) {
+ p_err("invalid net attach/detach type: %s", *argv);
+ return -EINVAL;
+ }
+ NEXT_ARG();
+
+ progfd = prog_parse_fd(&argc, &argv);
+ if (progfd < 0)
+ return -EINVAL;
+
+ ifindex = net_parse_dev(&argc, &argv);
+ if (ifindex < 1) {
+ close(progfd);
+ return -EINVAL;
+ }
+
+ if (argc) {
+ if (is_prefix(*argv, "overwrite")) {
+ overwrite = true;
+ } else {
+ p_err("expected 'overwrite', got: '%s'?", *argv);
+ close(progfd);
+ return -EINVAL;
+ }
+ }
+
+ /* attach xdp prog */
+ if (is_prefix("xdp", attach_type_strings[attach_type]))
+ err = do_attach_detach_xdp(progfd, attach_type, ifindex,
+ overwrite);
+
+ if (err < 0) {
+ p_err("interface %s attach failed: %s",
+ attach_type_strings[attach_type], strerror(-err));
+ return err;
+ }
+
+ if (json_output)
+ jsonw_null(json_wtr);
+
+ return 0;
+}
+
+static int do_detach(int argc, char **argv)
+{
+ enum net_attach_type attach_type;
+ int progfd, ifindex, err = 0;
+
+ /* parse detach args */
+ if (!REQ_ARGS(3))
+ return -EINVAL;
+
+ attach_type = parse_attach_type(*argv);
+ if (attach_type == net_attach_type_size) {
+ p_err("invalid net attach/detach type: %s", *argv);
+ return -EINVAL;
+ }
+ NEXT_ARG();
+
+ ifindex = net_parse_dev(&argc, &argv);
+ if (ifindex < 1)
+ return -EINVAL;
+
+ /* detach xdp prog */
+ progfd = -1;
+ if (is_prefix("xdp", attach_type_strings[attach_type]))
+ err = do_attach_detach_xdp(progfd, attach_type, ifindex, NULL);
+
+ if (err < 0) {
+ p_err("interface %s detach failed: %s",
+ attach_type_strings[attach_type], strerror(-err));
+ return err;
+ }
+
+ if (json_output)
+ jsonw_null(json_wtr);
+
+ return 0;
+}
+
static int do_show(int argc, char **argv)
{
struct bpf_attach_info attach_info = {};
@@ -232,13 +389,9 @@ static int do_show(int argc, char **argv)
char err_buf[256];
if (argc == 2) {
- if (strcmp(argv[0], "dev") != 0)
- usage();
- filter_idx = if_nametoindex(argv[1]);
- if (filter_idx == 0) {
- fprintf(stderr, "invalid dev name %s\n", argv[1]);
+ filter_idx = net_parse_dev(&argc, &argv);
+ if (filter_idx < 1)
return -1;
- }
} else if (argc != 0) {
usage();
}
@@ -305,13 +458,20 @@ static int do_help(int argc, char **argv)
fprintf(stderr,
"Usage: %s %s { show | list } [dev <devname>]\n"
+ " %s %s attach ATTACH_TYPE PROG dev <devname> [ overwrite ]\n"
+ " %s %s detach ATTACH_TYPE dev <devname>\n"
" %s %s help\n"
+ "\n"
+ " " HELP_SPEC_PROGRAM "\n"
+ " ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload }\n"
+ "\n"
"Note: Only xdp and tc attachments are supported now.\n"
" For progs attached to cgroups, use \"bpftool cgroup\"\n"
" to dump program attachments. For program types\n"
" sk_{filter,skb,msg,reuseport} and lwt/seg6, please\n"
" consult iproute2.\n",
- bin_name, argv[-2], bin_name, argv[-2]);
+ bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
+ bin_name, argv[-2]);
return 0;
}
@@ -319,6 +479,8 @@ static int do_help(int argc, char **argv)
static const struct cmd cmds[] = {
{ "show", do_show },
{ "list", do_show },
+ { "attach", do_attach },
+ { "detach", do_detach },
{ "help", do_help },
{ 0 }
};
diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c
index f2a545e667c4..b2046f33e23f 100644
--- a/tools/bpf/bpftool/perf.c
+++ b/tools/bpf/bpftool/perf.c
@@ -104,6 +104,8 @@ static void print_perf_json(int pid, int fd, __u32 prog_id, __u32 fd_type,
jsonw_string_field(json_wtr, "filename", buf);
jsonw_lluint_field(json_wtr, "offset", probe_offset);
break;
+ default:
+ break;
}
jsonw_end_object(json_wtr);
}
@@ -140,6 +142,8 @@ static void print_perf_plain(int pid, int fd, __u32 prog_id, __u32 fd_type,
printf("uretprobe filename %s offset %llu\n", buf,
probe_offset);
break;
+ default:
+ break;
}
}
diff --git a/tools/include/linux/compiler-gcc.h b/tools/include/linux/compiler-gcc.h
index 0d35f18006a1..95c072b70d0e 100644
--- a/tools/include/linux/compiler-gcc.h
+++ b/tools/include/linux/compiler-gcc.h
@@ -6,9 +6,11 @@
/*
* Common definitions for all gcc versions go here.
*/
+#ifndef GCC_VERSION
#define GCC_VERSION (__GNUC__ * 10000 \
+ __GNUC_MINOR__ * 100 \
+ __GNUC_PATCHLEVEL__)
+#endif
#if GCC_VERSION >= 70000 && !defined(__CHECKER__)
# define __fallthrough __attribute__ ((fallthrough))
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0e66371bea13..77c6be96d676 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -106,6 +106,7 @@ enum bpf_cmd {
BPF_TASK_FD_QUERY,
BPF_MAP_LOOKUP_AND_DELETE_ELEM,
BPF_MAP_FREEZE,
+ BPF_BTF_GET_NEXT_ID,
};
enum bpf_map_type {
@@ -284,6 +285,9 @@ enum bpf_attach_type {
*/
#define BPF_F_TEST_RND_HI32 (1U << 2)
+/* The verifier internal test flag. Behavior is undefined */
+#define BPF_F_TEST_STATE_FREQ (1U << 3)
+
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions:
*
@@ -337,6 +341,9 @@ enum bpf_attach_type {
#define BPF_F_RDONLY_PROG (1U << 7)
#define BPF_F_WRONLY_PROG (1U << 8)
+/* Clone map from listener for newly accepted socket */
+#define BPF_F_CLONE (1U << 9)
+
/* flags for BPF_PROG_QUERY */
#define BPF_F_QUERY_EFFECTIVE (1U << 0)
@@ -576,6 +583,8 @@ union bpf_attr {
* limited to five).
*
* Each time the helper is called, it appends a line to the trace.
+ * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
+ * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
* The format of the trace is customizable, and the exact output
* one will get depends on the options set in
* *\/sys/kernel/debug/tracing/trace_options* (see also the
@@ -1014,7 +1023,7 @@ union bpf_attr {
* The realm of the route for the packet associated to *skb*, or 0
* if none was found.
*
- * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
* Description
* Write raw *data* blob into a special BPF perf event held by
* *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
@@ -1076,7 +1085,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
- * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
+ * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
* Description
* Walk a user or a kernel stack and return its id. To achieve
* this, the helper needs *ctx*, which is a pointer to the context
@@ -1725,7 +1734,7 @@ union bpf_attr {
* Return
* 0 on success, or a negative error in case of failure.
*
- * int bpf_override_return(struct pt_reg *regs, u64 rc)
+ * int bpf_override_return(struct pt_regs *regs, u64 rc)
* Description
* Used for error injection, this helper uses kprobes to override
* the return value of the probed function, and to set it to *rc*.
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index faaa5ca2a117..be328c59389d 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -16,6 +16,18 @@
#define XDP_SHARED_UMEM (1 << 0)
#define XDP_COPY (1 << 1) /* Force copy-mode */
#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */
+/* If this option is set, the driver might go sleep and in that case
+ * the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be
+ * set. If it is set, the application need to explicitly wake up the
+ * driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are
+ * running the driver and the application on the same core, you should
+ * use this option so that the kernel will yield to the user space
+ * application.
+ */
+#define XDP_USE_NEED_WAKEUP (1 << 3)
+
+/* Flags for xsk_umem_config flags */
+#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
struct sockaddr_xdp {
__u16 sxdp_family;
@@ -25,10 +37,14 @@ struct sockaddr_xdp {
__u32 sxdp_shared_umem_fd;
};
+/* XDP_RING flags */
+#define XDP_RING_NEED_WAKEUP (1 << 0)
+
struct xdp_ring_offset {
__u64 producer;
__u64 consumer;
__u64 desc;
+ __u64 flags;
};
struct xdp_mmap_offsets {
@@ -53,6 +69,7 @@ struct xdp_umem_reg {
__u64 len; /* Length of packet data area */
__u32 chunk_size;
__u32 headroom;
+ __u32 flags;
};
struct xdp_statistics {
@@ -74,6 +91,11 @@ struct xdp_options {
#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL
#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL
+/* Masks for unaligned chunks mode */
+#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48
+#define XSK_UNALIGNED_BUF_ADDR_MASK \
+ ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
+
/* Rx/Tx descriptor */
struct xdp_desc {
__u64 addr;
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 9312066a1ae3..c6f94cffe06e 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -1,9 +1,10 @@
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
# Most of this file is copied from tools/lib/traceevent/Makefile
-BPF_VERSION = 0
-BPF_PATCHLEVEL = 0
-BPF_EXTRAVERSION = 4
+LIBBPF_VERSION := $(shell \
+ grep -oE '^LIBBPF_([0-9.]+)' libbpf.map | \
+ sort -rV | head -n1 | cut -d'_' -f2)
+LIBBPF_MAJOR_VERSION := $(firstword $(subst ., ,$(LIBBPF_VERSION)))
MAKEFLAGS += --no-print-directory
@@ -79,15 +80,9 @@ export prefix libdir src obj
libdir_SQ = $(subst ','\'',$(libdir))
libdir_relative_SQ = $(subst ','\'',$(libdir_relative))
-VERSION = $(BPF_VERSION)
-PATCHLEVEL = $(BPF_PATCHLEVEL)
-EXTRAVERSION = $(BPF_EXTRAVERSION)
-
OBJ = $@
N =
-LIBBPF_VERSION = $(BPF_VERSION).$(BPF_PATCHLEVEL).$(BPF_EXTRAVERSION)
-
LIB_TARGET = libbpf.a libbpf.so.$(LIBBPF_VERSION)
LIB_FILE = libbpf.a libbpf.so*
PC_FILE = libbpf.pc
@@ -113,6 +108,7 @@ override CFLAGS += -Werror -Wall
override CFLAGS += -fPIC
override CFLAGS += $(INCLUDES)
override CFLAGS += -fvisibility=hidden
+override CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
ifeq ($(VERBOSE),1)
Q =
@@ -138,7 +134,9 @@ LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE))
PC_FILE := $(addprefix $(OUTPUT),$(PC_FILE))
GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN) | \
- awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {s++} END{print s}')
+ cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \
+ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$8}' | \
+ sort -u | wc -l)
VERSIONED_SYM_COUNT = $(shell readelf -s --wide $(OUTPUT)libbpf.so | \
grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l)
@@ -178,10 +176,10 @@ $(BPF_IN): force elfdep bpfdep
$(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION)
$(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN)
- $(QUIET_LINK)$(CC) --shared -Wl,-soname,libbpf.so.$(VERSION) \
+ $(QUIET_LINK)$(CC) --shared -Wl,-soname,libbpf.so.$(LIBBPF_MAJOR_VERSION) \
-Wl,--version-script=$(VERSION_SCRIPT) $^ -lelf -o $@
@ln -sf $(@F) $(OUTPUT)libbpf.so
- @ln -sf $(@F) $(OUTPUT)libbpf.so.$(VERSION)
+ @ln -sf $(@F) $(OUTPUT)libbpf.so.$(LIBBPF_MAJOR_VERSION)
$(OUTPUT)libbpf.a: $(BPF_IN)
$(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
@@ -205,6 +203,7 @@ check_abi: $(OUTPUT)libbpf.so
"Please make sure all LIBBPF_API symbols are" \
"versioned in $(VERSION_SCRIPT)." >&2; \
readelf -s --wide $(OUTPUT)libbpf-in.o | \
+ cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \
awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$8}'| \
sort -u > $(OUTPUT)libbpf_global_syms.tmp; \
readelf -s --wide $(OUTPUT)libbpf.so | \
@@ -257,7 +256,8 @@ config-clean:
clean:
$(call QUIET_CLEAN, libbpf) $(RM) $(TARGETS) $(CXX_TEST_TARGET) \
- *.o *~ *.a *.so *.so.$(VERSION) .*.d .*.cmd *.pc LIBBPF-CFLAGS
+ *.o *~ *.a *.so *.so.$(LIBBPF_MAJOR_VERSION) .*.d .*.cmd \
+ *.pc LIBBPF-CFLAGS
$(call QUIET_CLEAN, core-gen) $(RM) $(OUTPUT)FEATURE-DUMP.libbpf
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index c7d7993c44bb..cbb933532981 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -568,7 +568,7 @@ int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr)
return ret;
}
-int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id)
+static int bpf_obj_get_next_id(__u32 start_id, __u32 *next_id, int cmd)
{
union bpf_attr attr;
int err;
@@ -576,26 +576,26 @@ int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id)
memset(&attr, 0, sizeof(attr));
attr.start_id = start_id;
- err = sys_bpf(BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr));
+ err = sys_bpf(cmd, &attr, sizeof(attr));
if (!err)
*next_id = attr.next_id;
return err;
}
-int bpf_map_get_next_id(__u32 start_id, __u32 *next_id)
+int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id)
{
- union bpf_attr attr;
- int err;
-
- memset(&attr, 0, sizeof(attr));
- attr.start_id = start_id;
+ return bpf_obj_get_next_id(start_id, next_id, BPF_PROG_GET_NEXT_ID);
+}
- err = sys_bpf(BPF_MAP_GET_NEXT_ID, &attr, sizeof(attr));
- if (!err)
- *next_id = attr.next_id;
+int bpf_map_get_next_id(__u32 start_id, __u32 *next_id)
+{
+ return bpf_obj_get_next_id(start_id, next_id, BPF_MAP_GET_NEXT_ID);
+}
- return err;
+int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id)
+{
+ return bpf_obj_get_next_id(start_id, next_id, BPF_BTF_GET_NEXT_ID);
}
int bpf_prog_get_fd_by_id(__u32 id)
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index ff42ca043dc8..0db01334740f 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -156,6 +156,7 @@ LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data,
__u32 *retval, __u32 *duration);
LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id);
LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id);
+LIBBPF_API int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id);
LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id);
LIBBPF_API int bpf_map_get_fd_by_id(__u32 id);
LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id);
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index f9d316e873d8..d04c7cb623ed 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -183,4 +183,10 @@ LIBBPF_0.0.4 {
perf_buffer__new;
perf_buffer__new_raw;
perf_buffer__poll;
+ xsk_umem__create;
} LIBBPF_0.0.3;
+
+LIBBPF_0.0.5 {
+ global:
+ bpf_btf_get_next_id;
+} LIBBPF_0.0.4;
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index 680e63066cf3..842c4fd55859 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -74,23 +74,6 @@ struct xsk_nl_info {
int fd;
};
-/* For 32-bit systems, we need to use mmap2 as the offsets are 64-bit.
- * Unfortunately, it is not part of glibc.
- */
-static inline void *xsk_mmap(void *addr, size_t length, int prot, int flags,
- int fd, __u64 offset)
-{
-#ifdef __NR_mmap2
- unsigned int page_shift = __builtin_ffs(getpagesize()) - 1;
- long ret = syscall(__NR_mmap2, addr, length, prot, flags, fd,
- (off_t)(offset >> page_shift));
-
- return (void *)ret;
-#else
- return mmap(addr, length, prot, flags, fd, offset);
-#endif
-}
-
int xsk_umem__fd(const struct xsk_umem *umem)
{
return umem ? umem->fd : -EINVAL;
@@ -116,6 +99,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg,
cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
+ cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
return;
}
@@ -123,6 +107,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg,
cfg->comp_size = usr_cfg->comp_size;
cfg->frame_size = usr_cfg->frame_size;
cfg->frame_headroom = usr_cfg->frame_headroom;
+ cfg->flags = usr_cfg->flags;
}
static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
@@ -149,9 +134,10 @@ static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
return 0;
}
-int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
- struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
- const struct xsk_umem_config *usr_config)
+int xsk_umem__create_v0_0_4(struct xsk_umem **umem_ptr, void *umem_area,
+ __u64 size, struct xsk_ring_prod *fill,
+ struct xsk_ring_cons *comp,
+ const struct xsk_umem_config *usr_config)
{
struct xdp_mmap_offsets off;
struct xdp_umem_reg mr;
@@ -182,6 +168,7 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
mr.len = size;
mr.chunk_size = umem->config.frame_size;
mr.headroom = umem->config.frame_headroom;
+ mr.flags = umem->config.flags;
err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
if (err) {
@@ -210,10 +197,9 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
goto out_socket;
}
- map = xsk_mmap(NULL, off.fr.desc +
- umem->config.fill_size * sizeof(__u64),
- PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
- umem->fd, XDP_UMEM_PGOFF_FILL_RING);
+ map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, umem->fd,
+ XDP_UMEM_PGOFF_FILL_RING);
if (map == MAP_FAILED) {
err = -errno;
goto out_socket;
@@ -224,13 +210,13 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
fill->size = umem->config.fill_size;
fill->producer = map + off.fr.producer;
fill->consumer = map + off.fr.consumer;
+ fill->flags = map + off.fr.flags;
fill->ring = map + off.fr.desc;
fill->cached_cons = umem->config.fill_size;
- map = xsk_mmap(NULL,
- off.cr.desc + umem->config.comp_size * sizeof(__u64),
- PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
- umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
+ map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, umem->fd,
+ XDP_UMEM_PGOFF_COMPLETION_RING);
if (map == MAP_FAILED) {
err = -errno;
goto out_mmap;
@@ -241,6 +227,7 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
comp->size = umem->config.comp_size;
comp->producer = map + off.cr.producer;
comp->consumer = map + off.cr.consumer;
+ comp->flags = map + off.cr.flags;
comp->ring = map + off.cr.desc;
*umem_ptr = umem;
@@ -255,6 +242,29 @@ out_umem_alloc:
return err;
}
+struct xsk_umem_config_v1 {
+ __u32 fill_size;
+ __u32 comp_size;
+ __u32 frame_size;
+ __u32 frame_headroom;
+};
+
+int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area,
+ __u64 size, struct xsk_ring_prod *fill,
+ struct xsk_ring_cons *comp,
+ const struct xsk_umem_config *usr_config)
+{
+ struct xsk_umem_config config;
+
+ memcpy(&config, usr_config, sizeof(struct xsk_umem_config_v1));
+ config.flags = 0;
+
+ return xsk_umem__create_v0_0_4(umem_ptr, umem_area, size, fill, comp,
+ &config);
+}
+asm(".symver xsk_umem__create_v0_0_2, xsk_umem__create@LIBBPF_0.0.2");
+asm(".symver xsk_umem__create_v0_0_4, xsk_umem__create@@LIBBPF_0.0.4");
+
static int xsk_load_xdp_prog(struct xsk_socket *xsk)
{
static const int log_buf_size = 16 * 1024;
@@ -550,11 +560,10 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
}
if (rx) {
- rx_map = xsk_mmap(NULL, off.rx.desc +
- xsk->config.rx_size * sizeof(struct xdp_desc),
- PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE,
- xsk->fd, XDP_PGOFF_RX_RING);
+ rx_map = mmap(NULL, off.rx.desc +
+ xsk->config.rx_size * sizeof(struct xdp_desc),
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+ xsk->fd, XDP_PGOFF_RX_RING);
if (rx_map == MAP_FAILED) {
err = -errno;
goto out_socket;
@@ -564,16 +573,16 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
rx->size = xsk->config.rx_size;
rx->producer = rx_map + off.rx.producer;
rx->consumer = rx_map + off.rx.consumer;
+ rx->flags = rx_map + off.rx.flags;
rx->ring = rx_map + off.rx.desc;
}
xsk->rx = rx;
if (tx) {
- tx_map = xsk_mmap(NULL, off.tx.desc +
- xsk->config.tx_size * sizeof(struct xdp_desc),
- PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE,
- xsk->fd, XDP_PGOFF_TX_RING);
+ tx_map = mmap(NULL, off.tx.desc +
+ xsk->config.tx_size * sizeof(struct xdp_desc),
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+ xsk->fd, XDP_PGOFF_TX_RING);
if (tx_map == MAP_FAILED) {
err = -errno;
goto out_mmap_rx;
@@ -583,6 +592,7 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
tx->size = xsk->config.tx_size;
tx->producer = tx_map + off.tx.producer;
tx->consumer = tx_map + off.tx.consumer;
+ tx->flags = tx_map + off.tx.flags;
tx->ring = tx_map + off.tx.desc;
tx->cached_cons = xsk->config.tx_size;
}
diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
index 833a6e60d065..584f6820a639 100644
--- a/tools/lib/bpf/xsk.h
+++ b/tools/lib/bpf/xsk.h
@@ -32,6 +32,7 @@ struct name { \
__u32 *producer; \
__u32 *consumer; \
void *ring; \
+ __u32 *flags; \
}
DEFINE_XSK_RING(xsk_ring_prod);
@@ -76,6 +77,11 @@ xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx)
return &descs[idx & rx->mask];
}
+static inline int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r)
+{
+ return *r->flags & XDP_RING_NEED_WAKEUP;
+}
+
static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
{
__u32 free_entries = r->cached_cons - r->cached_prod;
@@ -162,6 +168,21 @@ static inline void *xsk_umem__get_data(void *umem_area, __u64 addr)
return &((char *)umem_area)[addr];
}
+static inline __u64 xsk_umem__extract_addr(__u64 addr)
+{
+ return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
+}
+
+static inline __u64 xsk_umem__extract_offset(__u64 addr)
+{
+ return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
+}
+
+static inline __u64 xsk_umem__add_offset_to_addr(__u64 addr)
+{
+ return xsk_umem__extract_addr(addr) + xsk_umem__extract_offset(addr);
+}
+
LIBBPF_API int xsk_umem__fd(const struct xsk_umem *umem);
LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk);
@@ -170,12 +191,14 @@ LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk);
#define XSK_UMEM__DEFAULT_FRAME_SHIFT 12 /* 4096 bytes */
#define XSK_UMEM__DEFAULT_FRAME_SIZE (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT)
#define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0
+#define XSK_UMEM__DEFAULT_FLAGS 0
struct xsk_umem_config {
__u32 fill_size;
__u32 comp_size;
__u32 frame_size;
__u32 frame_headroom;
+ __u32 flags;
};
/* Flags for the libbpf_flags field. */
@@ -195,6 +218,16 @@ LIBBPF_API int xsk_umem__create(struct xsk_umem **umem,
struct xsk_ring_prod *fill,
struct xsk_ring_cons *comp,
const struct xsk_umem_config *config);
+LIBBPF_API int xsk_umem__create_v0_0_2(struct xsk_umem **umem,
+ void *umem_area, __u64 size,
+ struct xsk_ring_prod *fill,
+ struct xsk_ring_cons *comp,
+ const struct xsk_umem_config *config);
+LIBBPF_API int xsk_umem__create_v0_0_4(struct xsk_umem **umem,
+ void *umem_area, __u64 size,
+ struct xsk_ring_prod *fill,
+ struct xsk_ring_cons *comp,
+ const struct xsk_umem_config *config);
LIBBPF_API int xsk_socket__create(struct xsk_socket **xsk,
const char *ifname, __u32 queue_id,
struct xsk_umem *umem,
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 90f70d2c7c22..60c9338cd9b4 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -42,4 +42,5 @@ xdping
test_sockopt
test_sockopt_sk
test_sockopt_multi
+test_sockopt_inherit
test_tcp_rtt
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index d69c541e2039..9eef5edf17be 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -29,7 +29,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
test_cgroup_storage test_select_reuseport test_section_names \
test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
test_btf_dump test_cgroup_attach xdping test_sockopt test_sockopt_sk \
- test_sockopt_multi test_tcp_rtt
+ test_sockopt_multi test_sockopt_inherit test_tcp_rtt
BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
TEST_GEN_FILES = $(BPF_OBJ_FILES)
@@ -66,7 +66,8 @@ TEST_PROGS := test_kmod.sh \
test_tcp_check_syncookie.sh \
test_tc_tunnel.sh \
test_tc_edt.sh \
- test_xdping.sh
+ test_xdping.sh \
+ test_bpftool_build.sh
TEST_PROGS_EXTENDED := with_addr.sh \
with_tunnels.sh \
@@ -115,6 +116,7 @@ $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c
$(OUTPUT)/test_sockopt: cgroup_helpers.c
$(OUTPUT)/test_sockopt_sk: cgroup_helpers.c
$(OUTPUT)/test_sockopt_multi: cgroup_helpers.c
+$(OUTPUT)/test_sockopt_inherit: cgroup_helpers.c
$(OUTPUT)/test_tcp_rtt: cgroup_helpers.c
.PHONY: force
diff --git a/tools/testing/selftests/bpf/bpf_endian.h b/tools/testing/selftests/bpf/bpf_endian.h
index 05f036df8a4c..fbe28008450f 100644
--- a/tools/testing/selftests/bpf/bpf_endian.h
+++ b/tools/testing/selftests/bpf/bpf_endian.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
#ifndef __BPF_ENDIAN__
#define __BPF_ENDIAN__
@@ -29,6 +29,10 @@
# define __bpf_htonl(x) __builtin_bswap32(x)
# define __bpf_constant_ntohl(x) ___constant_swab32(x)
# define __bpf_constant_htonl(x) ___constant_swab32(x)
+# define __bpf_be64_to_cpu(x) __builtin_bswap64(x)
+# define __bpf_cpu_to_be64(x) __builtin_bswap64(x)
+# define __bpf_constant_be64_to_cpu(x) ___constant_swab64(x)
+# define __bpf_constant_cpu_to_be64(x) ___constant_swab64(x)
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
# define __bpf_ntohs(x) (x)
# define __bpf_htons(x) (x)
@@ -38,6 +42,10 @@
# define __bpf_htonl(x) (x)
# define __bpf_constant_ntohl(x) (x)
# define __bpf_constant_htonl(x) (x)
+# define __bpf_be64_to_cpu(x) (x)
+# define __bpf_cpu_to_be64(x) (x)
+# define __bpf_constant_be64_to_cpu(x) (x)
+# define __bpf_constant_cpu_to_be64(x) (x)
#else
# error "Fix your compiler's __BYTE_ORDER__?!"
#endif
@@ -54,5 +62,11 @@
#define bpf_ntohl(x) \
(__builtin_constant_p(x) ? \
__bpf_constant_ntohl(x) : __bpf_ntohl(x))
+#define bpf_cpu_to_be64(x) \
+ (__builtin_constant_p(x) ? \
+ __bpf_constant_cpu_to_be64(x) : __bpf_cpu_to_be64(x))
+#define bpf_be64_to_cpu(x) \
+ (__builtin_constant_p(x) ? \
+ __bpf_constant_be64_to_cpu(x) : __bpf_be64_to_cpu(x))
#endif /* __BPF_ENDIAN__ */
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 8b503ea142f0..6c4930bc6e2e 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
#ifndef __BPF_HELPERS_H
#define __BPF_HELPERS_H
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
index fb5840a62548..f10029821e16 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
@@ -48,16 +48,17 @@ void test_bpf_obj_id(void)
/* test_obj_id.o is a dumb prog. It should never fail
* to load.
*/
- if (err)
- error_cnt++;
- assert(!err);
+ if (CHECK_FAIL(err))
+ continue;
/* Insert a magic value to the map */
map_fds[i] = bpf_find_map(__func__, objs[i], "test_map_id");
- assert(map_fds[i] >= 0);
+ if (CHECK_FAIL(map_fds[i] < 0))
+ goto done;
err = bpf_map_update_elem(map_fds[i], &array_key,
&array_magic_value, 0);
- assert(!err);
+ if (CHECK_FAIL(err))
+ goto done;
/* Check getting map info */
info_len = sizeof(struct bpf_map_info) * 2;
@@ -96,9 +97,11 @@ void test_bpf_obj_id(void)
prog_infos[i].map_ids = ptr_to_u64(map_ids + i);
prog_infos[i].nr_map_ids = 2;
err = clock_gettime(CLOCK_REALTIME, &real_time_ts);
- assert(!err);
+ if (CHECK_FAIL(err))
+ goto done;
err = clock_gettime(CLOCK_BOOTTIME, &boot_time_ts);
- assert(!err);
+ if (CHECK_FAIL(err))
+ goto done;
err = bpf_obj_get_info_by_fd(prog_fds[i], &prog_infos[i],
&info_len);
load_time = (real_time_ts.tv_sec - boot_time_ts.tv_sec)
@@ -224,7 +227,8 @@ void test_bpf_obj_id(void)
nr_id_found++;
err = bpf_map_lookup_elem(map_fd, &array_key, &array_value);
- assert(!err);
+ if (CHECK_FAIL(err))
+ goto done;
err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);
CHECK(err || info_len != sizeof(struct bpf_map_info) ||
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
index 1a1eae356f81..1c01ee2600a9 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
@@ -28,8 +28,6 @@ static int check_load(const char *file, enum bpf_prog_type type)
attr.prog_flags = BPF_F_TEST_RND_HI32;
err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
bpf_object__close(obj);
- if (err)
- error_cnt++;
return err;
}
@@ -105,12 +103,7 @@ void test_bpf_verif_scale(void)
continue;
err = check_load(test->file, test->attach_type);
- if (test->fails) { /* expected to fail */
- if (err)
- error_cnt--;
- else
- error_cnt++;
- }
+ CHECK_FAIL(err && !test->fails);
}
if (env.verifier_stats)
diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
index 6892b88ae065..92563898867c 100644
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
@@ -344,7 +344,6 @@ struct test tests[] = {
.tcp.dest = 8080,
},
.keys = {
- .nhoff = 0,
.nhoff = ETH_HLEN,
.thoff = ETH_HLEN + sizeof(struct iphdr) +
sizeof(struct iphdr),
@@ -452,10 +451,8 @@ void test_flow_dissector(void)
err = bpf_flow_load(&obj, "./bpf_flow.o", "flow_dissector",
"jmp_table", "last_dissection", &prog_fd, &keys_fd);
- if (err) {
- error_cnt++;
+ if (CHECK_FAIL(err))
return;
- }
for (i = 0; i < ARRAY_SIZE(tests); i++) {
struct bpf_flow_keys flow_keys;
diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
index 3d59b3c841fe..eba9a970703b 100644
--- a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
@@ -135,10 +135,7 @@ void test_get_stack_raw_tp(void)
exp_cnt -= err;
}
- goto close_prog_noerr;
close_prog:
- error_cnt++;
-close_prog_noerr:
if (!IS_ERR_OR_NULL(link))
bpf_link__destroy(link);
if (!IS_ERR_OR_NULL(pb))
diff --git a/tools/testing/selftests/bpf/prog_tests/global_data.c b/tools/testing/selftests/bpf/prog_tests/global_data.c
index d011079fb0bf..c680926fce73 100644
--- a/tools/testing/selftests/bpf/prog_tests/global_data.c
+++ b/tools/testing/selftests/bpf/prog_tests/global_data.c
@@ -7,10 +7,8 @@ static void test_global_data_number(struct bpf_object *obj, __u32 duration)
uint64_t num;
map_fd = bpf_find_map(__func__, obj, "result_number");
- if (map_fd < 0) {
- error_cnt++;
+ if (CHECK_FAIL(map_fd < 0))
return;
- }
struct {
char *name;
@@ -44,10 +42,8 @@ static void test_global_data_string(struct bpf_object *obj, __u32 duration)
char str[32];
map_fd = bpf_find_map(__func__, obj, "result_string");
- if (map_fd < 0) {
- error_cnt++;
+ if (CHECK_FAIL(map_fd < 0))
return;
- }
struct {
char *name;
@@ -81,10 +77,8 @@ static void test_global_data_struct(struct bpf_object *obj, __u32 duration)
struct foo val;
map_fd = bpf_find_map(__func__, obj, "result_struct");
- if (map_fd < 0) {
- error_cnt++;
+ if (CHECK_FAIL(map_fd < 0))
return;
- }
struct {
char *name;
@@ -112,16 +106,12 @@ static void test_global_data_rdonly(struct bpf_object *obj, __u32 duration)
__u8 *buff;
map = bpf_object__find_map_by_name(obj, "test_glo.rodata");
- if (!map || !bpf_map__is_internal(map)) {
- error_cnt++;
+ if (CHECK_FAIL(!map || !bpf_map__is_internal(map)))
return;
- }
map_fd = bpf_map__fd(map);
- if (map_fd < 0) {
- error_cnt++;
+ if (CHECK_FAIL(map_fd < 0))
return;
- }
buff = malloc(bpf_map__def(map)->value_size);
if (buff)
diff --git a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
index 20ddca830e68..eaf64595be88 100644
--- a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
+++ b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
@@ -30,10 +30,8 @@ static void test_l4lb(const char *file)
u32 *magic = (u32 *)buf;
err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
- if (err) {
- error_cnt++;
+ if (CHECK_FAIL(err))
return;
- }
map_fd = bpf_find_map(__func__, obj, "vip_map");
if (map_fd < 0)
@@ -72,10 +70,9 @@ static void test_l4lb(const char *file)
bytes += stats[i].bytes;
pkts += stats[i].pkts;
}
- if (bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2) {
- error_cnt++;
+ if (CHECK_FAIL(bytes != MAGIC_BYTES * NUM_ITER * 2 ||
+ pkts != NUM_ITER * 2))
printf("test_l4lb:FAIL:stats %lld %lld\n", bytes, pkts);
- }
out:
bpf_object__close(obj);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/map_lock.c b/tools/testing/selftests/bpf/prog_tests/map_lock.c
index ee99368c595c..8f91f1881d11 100644
--- a/tools/testing/selftests/bpf/prog_tests/map_lock.c
+++ b/tools/testing/selftests/bpf/prog_tests/map_lock.c
@@ -8,14 +8,12 @@ static void *parallel_map_access(void *arg)
for (i = 0; i < 10000; i++) {
err = bpf_map_lookup_elem_flags(map_fd, &key, vars, BPF_F_LOCK);
- if (err) {
+ if (CHECK_FAIL(err)) {
printf("lookup failed\n");
- error_cnt++;
goto out;
}
- if (vars[0] != 0) {
+ if (CHECK_FAIL(vars[0] != 0)) {
printf("lookup #%d var[0]=%d\n", i, vars[0]);
- error_cnt++;
goto out;
}
rnd = vars[1];
@@ -24,7 +22,7 @@ static void *parallel_map_access(void *arg)
continue;
printf("lookup #%d var[1]=%d var[%d]=%d\n",
i, rnd, j, vars[j]);
- error_cnt++;
+ CHECK_FAIL(vars[j] != rnd);
goto out;
}
}
@@ -42,34 +40,36 @@ void test_map_lock(void)
void *ret;
err = bpf_prog_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
- if (err) {
+ if (CHECK_FAIL(err)) {
printf("test_map_lock:bpf_prog_load errno %d\n", errno);
goto close_prog;
}
map_fd[0] = bpf_find_map(__func__, obj, "hash_map");
- if (map_fd[0] < 0)
+ if (CHECK_FAIL(map_fd[0] < 0))
goto close_prog;
map_fd[1] = bpf_find_map(__func__, obj, "array_map");
- if (map_fd[1] < 0)
+ if (CHECK_FAIL(map_fd[1] < 0))
goto close_prog;
bpf_map_update_elem(map_fd[0], &key, vars, BPF_F_LOCK);
for (i = 0; i < 4; i++)
- assert(pthread_create(&thread_id[i], NULL,
- &spin_lock_thread, &prog_fd) == 0);
+ if (CHECK_FAIL(pthread_create(&thread_id[i], NULL,
+ &spin_lock_thread, &prog_fd)))
+ goto close_prog;
for (i = 4; i < 6; i++)
- assert(pthread_create(&thread_id[i], NULL,
- &parallel_map_access, &map_fd[i - 4]) == 0);
+ if (CHECK_FAIL(pthread_create(&thread_id[i], NULL,
+ &parallel_map_access,
+ &map_fd[i - 4])))
+ goto close_prog;
for (i = 0; i < 4; i++)
- assert(pthread_join(thread_id[i], &ret) == 0 &&
- ret == (void *)&prog_fd);
+ if (CHECK_FAIL(pthread_join(thread_id[i], &ret) ||
+ ret != (void *)&prog_fd))
+ goto close_prog;
for (i = 4; i < 6; i++)
- assert(pthread_join(thread_id[i], &ret) == 0 &&
- ret == (void *)&map_fd[i - 4]);
- goto close_prog_noerr;
+ if (CHECK_FAIL(pthread_join(thread_id[i], &ret) ||
+ ret != (void *)&map_fd[i - 4]))
+ goto close_prog;
close_prog:
- error_cnt++;
-close_prog_noerr:
bpf_object__close(obj);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_access.c
index 4ecfd721a044..a2537dfa899c 100644
--- a/tools/testing/selftests/bpf/prog_tests/pkt_access.c
+++ b/tools/testing/selftests/bpf/prog_tests/pkt_access.c
@@ -9,10 +9,8 @@ void test_pkt_access(void)
int err, prog_fd;
err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
- if (err) {
- error_cnt++;
+ if (CHECK_FAIL(err))
return;
- }
err = bpf_prog_test_run(prog_fd, 100000, &pkt_v4, sizeof(pkt_v4),
NULL, NULL, &retval, &duration);
diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
index ac0d43435806..5f7aea605019 100644
--- a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
+++ b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
@@ -9,10 +9,8 @@ void test_pkt_md_access(void)
int err, prog_fd;
err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
- if (err) {
- error_cnt++;
+ if (CHECK_FAIL(err))
return;
- }
err = bpf_prog_test_run(prog_fd, 10, &pkt_v4, sizeof(pkt_v4),
NULL, NULL, &retval, &duration);
diff --git a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
index e60cd5ff1f55..faccc66f4e39 100644
--- a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
@@ -27,10 +27,8 @@ static void test_queue_stack_map_by_type(int type)
return;
err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
- if (err) {
- error_cnt++;
+ if (CHECK_FAIL(err))
return;
- }
map_in_fd = bpf_find_map(__func__, obj, "map_in");
if (map_in_fd < 0)
@@ -43,10 +41,8 @@ static void test_queue_stack_map_by_type(int type)
/* Push 32 elements to the input map */
for (i = 0; i < MAP_SIZE; i++) {
err = bpf_map_update_elem(map_in_fd, NULL, &vals[i], 0);
- if (err) {
- error_cnt++;
+ if (CHECK_FAIL(err))
goto out;
- }
}
/* The eBPF program pushes iph.saddr in the output map,
diff --git a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
index 4a4f428d1a78..5c78e2b5a917 100644
--- a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
+++ b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
@@ -10,10 +10,8 @@ void test_reference_tracking(void)
int err = 0;
obj = bpf_object__open(file);
- if (IS_ERR(obj)) {
- error_cnt++;
+ if (CHECK_FAIL(IS_ERR(obj)))
return;
- }
bpf_object__for_each_program(prog, obj) {
const char *title;
diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
index 1575f0a1f586..b607112c64e7 100644
--- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
@@ -8,7 +8,7 @@ static void sigusr1_handler(int signum)
sigusr1_received++;
}
-static int test_send_signal_common(struct perf_event_attr *attr,
+static void test_send_signal_common(struct perf_event_attr *attr,
int prog_type,
const char *test_name)
{
@@ -23,13 +23,13 @@ static int test_send_signal_common(struct perf_event_attr *attr,
if (CHECK(pipe(pipe_c2p), test_name,
"pipe pipe_c2p error: %s\n", strerror(errno)))
- goto no_fork_done;
+ return;
if (CHECK(pipe(pipe_p2c), test_name,
"pipe pipe_p2c error: %s\n", strerror(errno))) {
close(pipe_c2p[0]);
close(pipe_c2p[1]);
- goto no_fork_done;
+ return;
}
pid = fork();
@@ -38,7 +38,7 @@ static int test_send_signal_common(struct perf_event_attr *attr,
close(pipe_c2p[1]);
close(pipe_p2c[0]);
close(pipe_p2c[1]);
- goto no_fork_done;
+ return;
}
if (pid == 0) {
@@ -125,7 +125,7 @@ static int test_send_signal_common(struct perf_event_attr *attr,
goto disable_pmu;
}
- err = CHECK(buf[0] != '2', test_name, "incorrect result\n");
+ CHECK(buf[0] != '2', test_name, "incorrect result\n");
/* notify child safe to exit */
write(pipe_p2c[1], buf, 1);
@@ -138,11 +138,9 @@ prog_load_failure:
close(pipe_c2p[0]);
close(pipe_p2c[1]);
wait(NULL);
-no_fork_done:
- return err;
}
-static int test_send_signal_tracepoint(void)
+static void test_send_signal_tracepoint(void)
{
const char *id_path = "/sys/kernel/debug/tracing/events/syscalls/sys_enter_nanosleep/id";
struct perf_event_attr attr = {
@@ -159,21 +157,21 @@ static int test_send_signal_tracepoint(void)
if (CHECK(efd < 0, "tracepoint",
"open syscalls/sys_enter_nanosleep/id failure: %s\n",
strerror(errno)))
- return -1;
+ return;
bytes = read(efd, buf, sizeof(buf));
close(efd);
if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "tracepoint",
"read syscalls/sys_enter_nanosleep/id failure: %s\n",
strerror(errno)))
- return -1;
+ return;
attr.config = strtol(buf, NULL, 0);
- return test_send_signal_common(&attr, BPF_PROG_TYPE_TRACEPOINT, "tracepoint");
+ test_send_signal_common(&attr, BPF_PROG_TYPE_TRACEPOINT, "tracepoint");
}
-static int test_send_signal_perf(void)
+static void test_send_signal_perf(void)
{
struct perf_event_attr attr = {
.sample_period = 1,
@@ -181,11 +179,11 @@ static int test_send_signal_perf(void)
.config = PERF_COUNT_SW_CPU_CLOCK,
};
- return test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT,
- "perf_sw_event");
+ test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT,
+ "perf_sw_event");
}
-static int test_send_signal_nmi(void)
+static void test_send_signal_nmi(void)
{
struct perf_event_attr attr = {
.sample_freq = 50,
@@ -204,25 +202,24 @@ static int test_send_signal_nmi(void)
if (errno == ENOENT) {
printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n",
__func__);
- return 0;
+ test__skip();
+ return;
}
/* Let the test fail with a more informative message */
} else {
close(pmu_fd);
}
- return test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT,
- "perf_hw_event");
+ test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT,
+ "perf_hw_event");
}
void test_send_signal(void)
{
- int ret = 0;
-
if (test__start_subtest("send_signal_tracepoint"))
- ret |= test_send_signal_tracepoint();
+ test_send_signal_tracepoint();
if (test__start_subtest("send_signal_perf"))
- ret |= test_send_signal_perf();
+ test_send_signal_perf();
if (test__start_subtest("send_signal_nmi"))
- ret |= test_send_signal_nmi();
+ test_send_signal_nmi();
}
diff --git a/tools/testing/selftests/bpf/prog_tests/spinlock.c b/tools/testing/selftests/bpf/prog_tests/spinlock.c
index 114ebe6a438e..1ae00cd3174e 100644
--- a/tools/testing/selftests/bpf/prog_tests/spinlock.c
+++ b/tools/testing/selftests/bpf/prog_tests/spinlock.c
@@ -11,19 +11,19 @@ void test_spinlock(void)
void *ret;
err = bpf_prog_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
- if (err) {
+ if (CHECK_FAIL(err)) {
printf("test_spin_lock:bpf_prog_load errno %d\n", errno);
goto close_prog;
}
for (i = 0; i < 4; i++)
- assert(pthread_create(&thread_id[i], NULL,
- &spin_lock_thread, &prog_fd) == 0);
+ if (CHECK_FAIL(pthread_create(&thread_id[i], NULL,
+ &spin_lock_thread, &prog_fd)))
+ goto close_prog;
+
for (i = 0; i < 4; i++)
- assert(pthread_join(thread_id[i], &ret) == 0 &&
- ret == (void *)&prog_fd);
- goto close_prog_noerr;
+ if (CHECK_FAIL(pthread_join(thread_id[i], &ret) ||
+ ret != (void *)&prog_fd))
+ goto close_prog;
close_prog:
- error_cnt++;
-close_prog_noerr:
bpf_object__close(obj);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c
index ac44fda84833..d841dced971f 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c
@@ -51,9 +51,10 @@ retry:
"err %d errno %d\n", err, errno))
goto disable_pmu;
- assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
- == 0);
- assert(system("./urandom_read") == 0);
+ if (CHECK_FAIL(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")))
+ goto disable_pmu;
+ if (CHECK_FAIL(system("./urandom_read")))
+ goto disable_pmu;
/* disable stack trace collection */
key = 0;
val = 1;
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
index 9557b7dfb782..f62aa0eb959b 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
@@ -82,9 +82,10 @@ retry:
"err %d errno %d\n", err, errno))
goto disable_pmu;
- assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
- == 0);
- assert(system("taskset 0x1 ./urandom_read 100000") == 0);
+ if (CHECK_FAIL(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")))
+ goto disable_pmu;
+ if (CHECK_FAIL(system("taskset 0x1 ./urandom_read 100000")))
+ goto disable_pmu;
/* disable stack trace collection */
key = 0;
val = 1;
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
index fc539335c5b3..37269d23df93 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
@@ -26,19 +26,19 @@ void test_stacktrace_map(void)
/* find map fds */
control_map_fd = bpf_find_map(__func__, obj, "control_map");
- if (control_map_fd < 0)
+ if (CHECK_FAIL(control_map_fd < 0))
goto disable_pmu;
stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
- if (stackid_hmap_fd < 0)
+ if (CHECK_FAIL(stackid_hmap_fd < 0))
goto disable_pmu;
stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
- if (stackmap_fd < 0)
+ if (CHECK_FAIL(stackmap_fd < 0))
goto disable_pmu;
stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
- if (stack_amap_fd < 0)
+ if (CHECK_FAIL(stack_amap_fd < 0))
goto disable_pmu;
/* give some time for bpf program run */
@@ -55,23 +55,20 @@ void test_stacktrace_map(void)
err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
"err %d errno %d\n", err, errno))
- goto disable_pmu_noerr;
+ goto disable_pmu;
err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
"err %d errno %d\n", err, errno))
- goto disable_pmu_noerr;
+ goto disable_pmu;
stack_trace_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len);
if (CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
"err %d errno %d\n", err, errno))
- goto disable_pmu_noerr;
+ goto disable_pmu;
- goto disable_pmu_noerr;
disable_pmu:
- error_cnt++;
-disable_pmu_noerr:
bpf_link__destroy(link);
close_prog:
bpf_object__close(obj);
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
index fbfa8e76cf63..404a5498e1a3 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
@@ -26,15 +26,15 @@ void test_stacktrace_map_raw_tp(void)
/* find map fds */
control_map_fd = bpf_find_map(__func__, obj, "control_map");
- if (control_map_fd < 0)
+ if (CHECK_FAIL(control_map_fd < 0))
goto close_prog;
stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
- if (stackid_hmap_fd < 0)
+ if (CHECK_FAIL(stackid_hmap_fd < 0))
goto close_prog;
stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
- if (stackmap_fd < 0)
+ if (CHECK_FAIL(stackmap_fd < 0))
goto close_prog;
/* give some time for bpf program run */
@@ -58,10 +58,7 @@ void test_stacktrace_map_raw_tp(void)
"err %d errno %d\n", err, errno))
goto close_prog;
- goto close_prog_noerr;
close_prog:
- error_cnt++;
-close_prog_noerr:
if (!IS_ERR_OR_NULL(link))
bpf_link__destroy(link);
bpf_object__close(obj);
diff --git a/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c b/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c
index 958a3d88de99..1bdc1d86a50c 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c
@@ -70,9 +70,6 @@ void test_task_fd_query_rawtp(void)
if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
goto close_prog;
- goto close_prog_noerr;
close_prog:
- error_cnt++;
-close_prog_noerr:
bpf_object__close(obj);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c b/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c
index f9b70e81682b..3f131b8fe328 100644
--- a/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c
+++ b/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c
@@ -62,14 +62,9 @@ static void test_task_fd_query_tp_core(const char *probe_name,
fd_type, buf))
goto close_pmu;
- close(pmu_fd);
- goto close_prog_noerr;
-
close_pmu:
close(pmu_fd);
close_prog:
- error_cnt++;
-close_prog_noerr:
bpf_object__close(obj);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_estats.c b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
index bb8759d69099..594307dffd13 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
@@ -10,10 +10,8 @@ void test_tcp_estats(void)
err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
CHECK(err, "", "err %d errno %d\n", err, errno);
- if (err) {
- error_cnt++;
+ if (err)
return;
- }
bpf_object__close(obj);
}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp.c b/tools/testing/selftests/bpf/prog_tests/xdp.c
index a74167289545..dcb5ecac778e 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp.c
@@ -16,10 +16,8 @@ void test_xdp(void)
int err, prog_fd, map_fd;
err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
- if (err) {
- error_cnt++;
+ if (CHECK_FAIL(err))
return;
- }
map_fd = bpf_find_map(__func__, obj, "vip2tnl");
if (map_fd < 0)
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
index 922aa0a19764..3744196d7cba 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
@@ -10,10 +10,8 @@ void test_xdp_adjust_tail(void)
int err, prog_fd;
err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
- if (err) {
- error_cnt++;
+ if (CHECK_FAIL(err))
return;
- }
err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
buf, &size, &retval, &duration);
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c
index 15f7c272edb0..c9404e6b226e 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c
@@ -31,10 +31,8 @@ void test_xdp_noinline(void)
u32 *magic = (u32 *)buf;
err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
- if (err) {
- error_cnt++;
+ if (CHECK_FAIL(err))
return;
- }
map_fd = bpf_find_map(__func__, obj, "vip_map");
if (map_fd < 0)
@@ -73,8 +71,8 @@ void test_xdp_noinline(void)
bytes += stats[i].bytes;
pkts += stats[i].pkts;
}
- if (bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2) {
- error_cnt++;
+ if (CHECK_FAIL(bytes != MAGIC_BYTES * NUM_ITER * 2 ||
+ pkts != NUM_ITER * 2)) {
printf("test_xdp_noinline:FAIL:stats %lld %lld\n",
bytes, pkts);
}
diff --git a/tools/testing/selftests/bpf/progs/sockopt_inherit.c b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
new file mode 100644
index 000000000000..dede0fcd6102
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
+
+#define SOL_CUSTOM 0xdeadbeef
+#define CUSTOM_INHERIT1 0
+#define CUSTOM_INHERIT2 1
+#define CUSTOM_LISTENER 2
+
+struct sockopt_inherit {
+ __u8 val;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
+ __type(key, int);
+ __type(value, struct sockopt_inherit);
+} cloned1_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
+ __type(key, int);
+ __type(value, struct sockopt_inherit);
+} cloned2_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct sockopt_inherit);
+} listener_only_map SEC(".maps");
+
+static __inline struct sockopt_inherit *get_storage(struct bpf_sockopt *ctx)
+{
+ if (ctx->optname == CUSTOM_INHERIT1)
+ return bpf_sk_storage_get(&cloned1_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ else if (ctx->optname == CUSTOM_INHERIT2)
+ return bpf_sk_storage_get(&cloned2_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ else
+ return bpf_sk_storage_get(&listener_only_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+}
+
+SEC("cgroup/getsockopt")
+int _getsockopt(struct bpf_sockopt *ctx)
+{
+ __u8 *optval_end = ctx->optval_end;
+ struct sockopt_inherit *storage;
+ __u8 *optval = ctx->optval;
+
+ if (ctx->level != SOL_CUSTOM)
+ return 1; /* only interested in SOL_CUSTOM */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ storage = get_storage(ctx);
+ if (!storage)
+ return 0; /* EPERM, couldn't get sk storage */
+
+ ctx->retval = 0; /* Reset system call return value to zero */
+
+ optval[0] = storage->val;
+ ctx->optlen = 1;
+
+ return 1;
+}
+
+SEC("cgroup/setsockopt")
+int _setsockopt(struct bpf_sockopt *ctx)
+{
+ __u8 *optval_end = ctx->optval_end;
+ struct sockopt_inherit *storage;
+ __u8 *optval = ctx->optval;
+
+ if (ctx->level != SOL_CUSTOM)
+ return 1; /* only interested in SOL_CUSTOM */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ storage = get_storage(ctx);
+ if (!storage)
+ return 0; /* EPERM, couldn't get sk storage */
+
+ storage->val = optval[0];
+ ctx->optlen = -1;
+
+ return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c b/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c
index a334a0e882e4..41a3ebcd593d 100644
--- a/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c
+++ b/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c
@@ -12,10 +12,6 @@
#define SR6_FLAG_ALERT (1 << 4)
-#define htonll(x) ((bpf_htonl(1)) == 1 ? (x) : ((uint64_t)bpf_htonl((x) & \
- 0xFFFFFFFF) << 32) | bpf_htonl((x) >> 32))
-#define ntohll(x) ((bpf_ntohl(1)) == 1 ? (x) : ((uint64_t)bpf_ntohl((x) & \
- 0xFFFFFFFF) << 32) | bpf_ntohl((x) >> 32))
#define BPF_PACKET_HEADER __attribute__((packed))
struct ip6_t {
@@ -276,8 +272,8 @@ int has_egr_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh)
return 0;
// check if egress TLV value is correct
- if (ntohll(egr_addr.hi) == 0xfd00000000000000 &&
- ntohll(egr_addr.lo) == 0x4)
+ if (bpf_be64_to_cpu(egr_addr.hi) == 0xfd00000000000000 &&
+ bpf_be64_to_cpu(egr_addr.lo) == 0x4)
return 1;
}
@@ -308,8 +304,8 @@ int __encap_srh(struct __sk_buff *skb)
#pragma clang loop unroll(full)
for (unsigned long long lo = 0; lo < 4; lo++) {
- seg->lo = htonll(4 - lo);
- seg->hi = htonll(hi);
+ seg->lo = bpf_cpu_to_be64(4 - lo);
+ seg->hi = bpf_cpu_to_be64(hi);
seg = (struct ip6_addr_t *)((char *)seg + sizeof(*seg));
}
@@ -349,8 +345,8 @@ int __add_egr_x(struct __sk_buff *skb)
if (err)
return BPF_DROP;
- addr.lo = htonll(lo);
- addr.hi = htonll(hi);
+ addr.lo = bpf_cpu_to_be64(lo);
+ addr.hi = bpf_cpu_to_be64(hi);
err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X,
(void *)&addr, sizeof(addr));
if (err)
diff --git a/tools/testing/selftests/bpf/progs/test_seg6_loop.c b/tools/testing/selftests/bpf/progs/test_seg6_loop.c
index 1dbe1d4d467e..c4d104428643 100644
--- a/tools/testing/selftests/bpf/progs/test_seg6_loop.c
+++ b/tools/testing/selftests/bpf/progs/test_seg6_loop.c
@@ -12,10 +12,6 @@
#define SR6_FLAG_ALERT (1 << 4)
-#define htonll(x) ((bpf_htonl(1)) == 1 ? (x) : ((uint64_t)bpf_htonl((x) & \
- 0xFFFFFFFF) << 32) | bpf_htonl((x) >> 32))
-#define ntohll(x) ((bpf_ntohl(1)) == 1 ? (x) : ((uint64_t)bpf_ntohl((x) & \
- 0xFFFFFFFF) << 32) | bpf_ntohl((x) >> 32))
#define BPF_PACKET_HEADER __attribute__((packed))
struct ip6_t {
@@ -251,8 +247,8 @@ int __add_egr_x(struct __sk_buff *skb)
if (err)
return BPF_DROP;
- addr.lo = htonll(lo);
- addr.hi = htonll(hi);
+ addr.lo = bpf_cpu_to_be64(lo);
+ addr.hi = bpf_cpu_to_be64(hi);
err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X,
(void *)&addr, sizeof(addr));
if (err)
diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh
new file mode 100755
index 000000000000..4ba5a34bff56
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bpftool_build.sh
@@ -0,0 +1,143 @@
+#!/bin/bash
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
+ERROR=0
+TMPDIR=
+
+# If one build fails, continue but return non-0 on exit.
+return_value() {
+ if [ -d "$TMPDIR" ] ; then
+ rm -rf -- $TMPDIR
+ fi
+ exit $ERROR
+}
+trap return_value EXIT
+
+case $1 in
+ -h|--help)
+ echo -e "$0 [-j <n>]"
+ echo -e "\tTest the different ways of building bpftool."
+ echo -e ""
+ echo -e "\tOptions:"
+ echo -e "\t\t-j <n>:\tPass -j flag to 'make'."
+ exit
+ ;;
+esac
+
+J=$*
+
+# Assume script is located under tools/testing/selftests/bpf/. We want to start
+# build attempts from the top of kernel repository.
+SCRIPT_REL_PATH=$(realpath --relative-to=$PWD $0)
+SCRIPT_REL_DIR=$(dirname $SCRIPT_REL_PATH)
+KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../)
+cd $KDIR_ROOT_DIR
+
+check() {
+ local dir=$(realpath $1)
+
+ echo -n "binary: "
+ # Returns non-null if file is found (and "false" is run)
+ find $dir -type f -executable -name bpftool -print -exec false {} + && \
+ ERROR=1 && printf "FAILURE: Did not find bpftool\n"
+}
+
+make_and_clean() {
+ echo -e "\$PWD: $PWD"
+ echo -e "command: make -s $* >/dev/null"
+ make $J -s $* >/dev/null
+ if [ $? -ne 0 ] ; then
+ ERROR=1
+ fi
+ if [ $# -ge 1 ] ; then
+ check ${@: -1}
+ else
+ check .
+ fi
+ (
+ if [ $# -ge 1 ] ; then
+ cd ${@: -1}
+ fi
+ make -s clean
+ )
+ echo
+}
+
+make_with_tmpdir() {
+ local ARGS
+
+ TMPDIR=$(mktemp -d)
+ if [ $# -ge 2 ] ; then
+ ARGS=${@:1:(($# - 1))}
+ fi
+ echo -e "\$PWD: $PWD"
+ echo -e "command: make -s $ARGS ${@: -1}=$TMPDIR/ >/dev/null"
+ make $J -s $ARGS ${@: -1}=$TMPDIR/ >/dev/null
+ if [ $? -ne 0 ] ; then
+ ERROR=1
+ fi
+ check $TMPDIR
+ rm -rf -- $TMPDIR
+ echo
+}
+
+echo "Trying to build bpftool"
+echo -e "... through kbuild\n"
+
+if [ -f ".config" ] ; then
+ make_and_clean tools/bpf
+
+ ## $OUTPUT is overwritten in kbuild Makefile, and thus cannot be passed
+ ## down from toplevel Makefile to bpftool's Makefile.
+
+ # make_with_tmpdir tools/bpf OUTPUT
+ echo -e "skip: make tools/bpf OUTPUT=<dir> (not supported)\n"
+
+ make_with_tmpdir tools/bpf O
+else
+ echo -e "skip: make tools/bpf (no .config found)\n"
+ echo -e "skip: make tools/bpf OUTPUT=<dir> (not supported)\n"
+ echo -e "skip: make tools/bpf O=<dir> (no .config found)\n"
+fi
+
+echo -e "... from kernel source tree\n"
+
+make_and_clean -C tools/bpf/bpftool
+
+make_with_tmpdir -C tools/bpf/bpftool OUTPUT
+
+make_with_tmpdir -C tools/bpf/bpftool O
+
+echo -e "... from tools/\n"
+cd tools/
+
+make_and_clean bpf
+
+## In tools/bpf/Makefile, function "descend" is called and passes $(O) and
+## $(OUTPUT). We would like $(OUTPUT) to have "bpf/bpftool/" appended before
+## calling bpftool's Makefile, but this is not the case as the "descend"
+## function focuses on $(O)/$(subdir). However, in the present case, updating
+## $(O) to have $(OUTPUT) recomputed from it in bpftool's Makefile does not
+## work, because $(O) is not defined from command line and $(OUTPUT) is not
+## updated in tools/scripts/Makefile.include.
+##
+## Workarounds would require to a) edit "descend" or use an alternative way to
+## call bpftool's Makefile, b) modify the conditions to update $(OUTPUT) and
+## other variables in tools/scripts/Makefile.include (at the risk of breaking
+## the build of other tools), or c) append manually the "bpf/bpftool" suffix to
+## $(OUTPUT) in bpf's Makefile, which may break if targets for other directories
+## use "descend" in the future.
+
+# make_with_tmpdir bpf OUTPUT
+echo -e "skip: make bpf OUTPUT=<dir> (not supported)\n"
+
+make_with_tmpdir bpf O
+
+echo -e "... from bpftool's dir\n"
+cd bpf/bpftool
+
+make_and_clean
+
+make_with_tmpdir OUTPUT
+
+make_with_tmpdir O
diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py
index 425f9ed27c3b..15a666329a34 100755
--- a/tools/testing/selftests/bpf/test_offload.py
+++ b/tools/testing/selftests/bpf/test_offload.py
@@ -1353,7 +1353,7 @@ try:
bpftool_prog_list_wait(expected=1)
ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"]
- fail(ifnameB != simB1['ifname'], "program not bound to originial device")
+ fail(ifnameB != simB1['ifname'], "program not bound to original device")
simB1.remove()
bpftool_prog_list_wait(expected=1)
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 12895d03d58b..e8616e778cb5 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -8,22 +8,20 @@
/* defined in test_progs.h */
struct test_env env;
-int error_cnt, pass_cnt;
struct prog_test_def {
const char *test_name;
int test_num;
void (*run_test)(void);
bool force_log;
- int pass_cnt;
int error_cnt;
+ int skip_cnt;
bool tested;
const char *subtest_name;
int subtest_num;
/* store counts before subtest started */
- int old_pass_cnt;
int old_error_cnt;
};
@@ -47,6 +45,7 @@ static void dump_test_log(const struct prog_test_def *test, bool failed)
if (env.verbose || test->force_log || failed) {
if (env.log_cnt) {
+ env.log_buf[env.log_cnt] = '\0';
fprintf(env.stdout, "%s", env.log_buf);
if (env.log_buf[env.log_cnt - 1] != '\n')
fprintf(env.stdout, "\n");
@@ -56,15 +55,24 @@ static void dump_test_log(const struct prog_test_def *test, bool failed)
fseeko(stdout, 0, SEEK_SET); /* rewind */
}
+static void skip_account(void)
+{
+ if (env.test->skip_cnt) {
+ env.skip_cnt++;
+ env.test->skip_cnt = 0;
+ }
+}
+
void test__end_subtest()
{
struct prog_test_def *test = env.test;
- int sub_error_cnt = error_cnt - test->old_error_cnt;
+ int sub_error_cnt = test->error_cnt - test->old_error_cnt;
if (sub_error_cnt)
env.fail_cnt++;
else
env.sub_succ_cnt++;
+ skip_account();
dump_test_log(test, sub_error_cnt);
@@ -95,8 +103,7 @@ bool test__start_subtest(const char *name)
return false;
test->subtest_name = name;
- env.test->old_pass_cnt = pass_cnt;
- env.test->old_error_cnt = error_cnt;
+ env.test->old_error_cnt = env.test->error_cnt;
return true;
}
@@ -105,6 +112,16 @@ void test__force_log() {
env.test->force_log = true;
}
+void test__skip(void)
+{
+ env.test->skip_cnt++;
+}
+
+void test__fail(void)
+{
+ env.test->error_cnt++;
+}
+
struct ipv4_packet pkt_v4 = {
.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
.iph.ihl = 5,
@@ -129,7 +146,7 @@ int bpf_find_map(const char *test, struct bpf_object *obj, const char *name)
map = bpf_object__find_map_by_name(obj, name);
if (!map) {
printf("%s:FAIL:map '%s' not found\n", test, name);
- error_cnt++;
+ test__fail();
return -1;
}
return bpf_map__fd(map);
@@ -488,8 +505,6 @@ int main(int argc, char **argv)
stdio_hijack();
for (i = 0; i < prog_test_cnt; i++) {
struct prog_test_def *test = &prog_test_defs[i];
- int old_pass_cnt = pass_cnt;
- int old_error_cnt = error_cnt;
env.test = test;
test->test_num = i + 1;
@@ -504,12 +519,11 @@ int main(int argc, char **argv)
test__end_subtest();
test->tested = true;
- test->pass_cnt = pass_cnt - old_pass_cnt;
- test->error_cnt = error_cnt - old_error_cnt;
if (test->error_cnt)
env.fail_cnt++;
else
env.succ_cnt++;
+ skip_account();
dump_test_log(test, test->error_cnt);
@@ -518,11 +532,11 @@ int main(int argc, char **argv)
test->error_cnt ? "FAIL" : "OK");
}
stdio_restore();
- printf("Summary: %d/%d PASSED, %d FAILED\n",
- env.succ_cnt, env.sub_succ_cnt, env.fail_cnt);
+ printf("Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n",
+ env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt);
free(env.test_selector.num_set);
free(env.subtest_selector.num_set);
- return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
+ return env.fail_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
}
diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h
index 37d427f5a1e5..c8edb9464ba6 100644
--- a/tools/testing/selftests/bpf/test_progs.h
+++ b/tools/testing/selftests/bpf/test_progs.h
@@ -38,8 +38,6 @@ typedef __u16 __sum16;
#include "trace_helpers.h"
#include "flow_dissector_load.h"
-struct prog_test_def;
-
struct test_selector {
const char *name;
bool *num_set;
@@ -64,14 +62,15 @@ struct test_env {
int succ_cnt; /* successful tests */
int sub_succ_cnt; /* successful sub-tests */
int fail_cnt; /* total failed tests + sub-tests */
+ int skip_cnt; /* skipped tests */
};
-extern int error_cnt;
-extern int pass_cnt;
extern struct test_env env;
extern void test__force_log();
extern bool test__start_subtest(const char *name);
+extern void test__skip(void);
+extern void test__fail(void);
#define MAGIC_BYTES 123
@@ -94,17 +93,25 @@ extern struct ipv6_packet pkt_v6;
#define _CHECK(condition, tag, duration, format...) ({ \
int __ret = !!(condition); \
if (__ret) { \
- error_cnt++; \
+ test__fail(); \
printf("%s:FAIL:%s ", __func__, tag); \
printf(format); \
} else { \
- pass_cnt++; \
printf("%s:PASS:%s %d nsec\n", \
__func__, tag, duration); \
} \
__ret; \
})
+#define CHECK_FAIL(condition) ({ \
+ int __ret = !!(condition); \
+ if (__ret) { \
+ test__fail(); \
+ printf("%s:FAIL:%d\n", __func__, __LINE__); \
+ } \
+ __ret; \
+})
+
#define CHECK(condition, tag, format...) \
_CHECK(condition, tag, duration, format)
#define CHECK_ATTR(condition, tag, format...) \
diff --git a/tools/testing/selftests/bpf/test_sockopt_inherit.c b/tools/testing/selftests/bpf/test_sockopt_inherit.c
new file mode 100644
index 000000000000..1bf699815b9b
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockopt_inherit.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <error.h>
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <pthread.h>
+
+#include <linux/filter.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+
+#define CG_PATH "/sockopt_inherit"
+#define SOL_CUSTOM 0xdeadbeef
+#define CUSTOM_INHERIT1 0
+#define CUSTOM_INHERIT2 1
+#define CUSTOM_LISTENER 2
+
+static int connect_to_server(int server_fd)
+{
+ struct sockaddr_storage addr;
+ socklen_t len = sizeof(addr);
+ int fd;
+
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (fd < 0) {
+ log_err("Failed to create client socket");
+ return -1;
+ }
+
+ if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
+ log_err("Failed to get server addr");
+ goto out;
+ }
+
+ if (connect(fd, (const struct sockaddr *)&addr, len) < 0) {
+ log_err("Fail to connect to server");
+ goto out;
+ }
+
+ return fd;
+
+out:
+ close(fd);
+ return -1;
+}
+
+static int verify_sockopt(int fd, int optname, const char *msg, char expected)
+{
+ socklen_t optlen = 1;
+ char buf = 0;
+ int err;
+
+ err = getsockopt(fd, SOL_CUSTOM, optname, &buf, &optlen);
+ if (err) {
+ log_err("%s: failed to call getsockopt", msg);
+ return 1;
+ }
+
+ printf("%s %d: got=0x%x ? expected=0x%x\n", msg, optname, buf, expected);
+
+ if (buf != expected) {
+ log_err("%s: unexpected getsockopt value %d != %d", msg,
+ buf, expected);
+ return 1;
+ }
+
+ return 0;
+}
+
+static void *server_thread(void *arg)
+{
+ struct sockaddr_storage addr;
+ socklen_t len = sizeof(addr);
+ int fd = *(int *)arg;
+ int client_fd;
+ int err = 0;
+
+ if (listen(fd, 1) < 0)
+ error(1, errno, "Failed to listed on socket");
+
+ err += verify_sockopt(fd, CUSTOM_INHERIT1, "listen", 1);
+ err += verify_sockopt(fd, CUSTOM_INHERIT2, "listen", 1);
+ err += verify_sockopt(fd, CUSTOM_LISTENER, "listen", 1);
+
+ client_fd = accept(fd, (struct sockaddr *)&addr, &len);
+ if (client_fd < 0)
+ error(1, errno, "Failed to accept client");
+
+ err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "accept", 1);
+ err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "accept", 1);
+ err += verify_sockopt(client_fd, CUSTOM_LISTENER, "accept", 0);
+
+ close(client_fd);
+
+ return (void *)(long)err;
+}
+
+static int start_server(void)
+{
+ struct sockaddr_in addr = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
+ };
+ char buf;
+ int err;
+ int fd;
+ int i;
+
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (fd < 0) {
+ log_err("Failed to create server socket");
+ return -1;
+ }
+
+ for (i = CUSTOM_INHERIT1; i <= CUSTOM_LISTENER; i++) {
+ buf = 0x01;
+ err = setsockopt(fd, SOL_CUSTOM, i, &buf, 1);
+ if (err) {
+ log_err("Failed to call setsockopt(%d)", i);
+ close(fd);
+ return -1;
+ }
+ }
+
+ if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) {
+ log_err("Failed to bind socket");
+ close(fd);
+ return -1;
+ }
+
+ return fd;
+}
+
+static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
+{
+ enum bpf_attach_type attach_type;
+ enum bpf_prog_type prog_type;
+ struct bpf_program *prog;
+ int err;
+
+ err = libbpf_prog_type_by_name(title, &prog_type, &attach_type);
+ if (err) {
+ log_err("Failed to deduct types for %s BPF program", title);
+ return -1;
+ }
+
+ prog = bpf_object__find_program_by_title(obj, title);
+ if (!prog) {
+ log_err("Failed to find %s BPF program", title);
+ return -1;
+ }
+
+ err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd,
+ attach_type, 0);
+ if (err) {
+ log_err("Failed to attach %s BPF program", title);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int run_test(int cgroup_fd)
+{
+ struct bpf_prog_load_attr attr = {
+ .file = "./sockopt_inherit.o",
+ };
+ int server_fd = -1, client_fd;
+ struct bpf_object *obj;
+ void *server_err;
+ pthread_t tid;
+ int ignored;
+ int err;
+
+ err = bpf_prog_load_xattr(&attr, &obj, &ignored);
+ if (err) {
+ log_err("Failed to load BPF object");
+ return -1;
+ }
+
+ err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt");
+ if (err)
+ goto close_bpf_object;
+
+ err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt");
+ if (err)
+ goto close_bpf_object;
+
+ server_fd = start_server();
+ if (server_fd < 0) {
+ err = -1;
+ goto close_bpf_object;
+ }
+
+ pthread_create(&tid, NULL, server_thread, (void *)&server_fd);
+
+ client_fd = connect_to_server(server_fd);
+ if (client_fd < 0) {
+ err = -1;
+ goto close_server_fd;
+ }
+
+ err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "connect", 0);
+ err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "connect", 0);
+ err += verify_sockopt(client_fd, CUSTOM_LISTENER, "connect", 0);
+
+ pthread_join(tid, &server_err);
+
+ err += (int)(long)server_err;
+
+ close(client_fd);
+
+close_server_fd:
+ close(server_fd);
+close_bpf_object:
+ bpf_object__close(obj);
+ return err;
+}
+
+int main(int args, char **argv)
+{
+ int cgroup_fd;
+ int err = EXIT_SUCCESS;
+
+ if (setup_cgroup_environment())
+ return err;
+
+ cgroup_fd = create_and_get_cgroup(CG_PATH);
+ if (cgroup_fd < 0)
+ goto cleanup_cgroup_env;
+
+ if (join_cgroup(CG_PATH))
+ goto cleanup_cgroup;
+
+ if (run_test(cgroup_fd))
+ err = EXIT_FAILURE;
+
+ printf("test_sockopt_inherit: %s\n",
+ err == EXIT_SUCCESS ? "PASSED" : "FAILED");
+
+cleanup_cgroup:
+ close(cgroup_fd);
+cleanup_cgroup_env:
+ cleanup_cgroup_environment();
+ return err;
+}
diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
index a3bebd7c68dd..fc33ae36b760 100644
--- a/tools/testing/selftests/bpf/test_sysctl.c
+++ b/tools/testing/selftests/bpf/test_sysctl.c
@@ -13,6 +13,7 @@
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
+#include "bpf_endian.h"
#include "bpf_rlimit.h"
#include "bpf_util.h"
#include "cgroup_helpers.h"
@@ -100,7 +101,7 @@ static struct sysctl_test tests[] = {
.descr = "ctx:write sysctl:write read ok",
.insns = {
/* If (write) */
- BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_1,
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
offsetof(struct bpf_sysctl, write)),
BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 1, 2),
@@ -214,7 +215,8 @@ static struct sysctl_test tests[] = {
/* if (ret == expected && */
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, sizeof("tcp_mem") - 1, 6),
/* buf == "tcp_mem\0") */
- BPF_LD_IMM64(BPF_REG_8, 0x006d656d5f706374ULL),
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x7463705f6d656d00ULL)),
BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
@@ -255,7 +257,8 @@ static struct sysctl_test tests[] = {
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
/* buf[0:7] == "tcp_me\0") */
- BPF_LD_IMM64(BPF_REG_8, 0x00656d5f706374ULL),
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x7463705f6d650000ULL)),
BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
@@ -298,12 +301,14 @@ static struct sysctl_test tests[] = {
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 16, 14),
/* buf[0:8] == "net/ipv4" && */
- BPF_LD_IMM64(BPF_REG_8, 0x347670692f74656eULL),
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x6e65742f69707634ULL)),
BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 10),
/* buf[8:16] == "/tcp_mem" && */
- BPF_LD_IMM64(BPF_REG_8, 0x6d656d5f7063742fULL),
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x2f7463705f6d656dULL)),
BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
@@ -350,12 +355,14 @@ static struct sysctl_test tests[] = {
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 10),
/* buf[0:8] == "net/ipv4" && */
- BPF_LD_IMM64(BPF_REG_8, 0x347670692f74656eULL),
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x6e65742f69707634ULL)),
BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
/* buf[8:16] == "/tcp_me\0") */
- BPF_LD_IMM64(BPF_REG_8, 0x00656d5f7063742fULL),
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x2f7463705f6d6500ULL)),
BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
@@ -396,7 +403,8 @@ static struct sysctl_test tests[] = {
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
/* buf[0:8] == "net/ip\0") */
- BPF_LD_IMM64(BPF_REG_8, 0x000070692f74656eULL),
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x6e65742f69700000ULL)),
BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
@@ -431,7 +439,8 @@ static struct sysctl_test tests[] = {
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 6, 6),
/* buf[0:6] == "Linux\n\0") */
- BPF_LD_IMM64(BPF_REG_8, 0x000a78756e694cULL),
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x4c696e75780a0000ULL)),
BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
@@ -469,7 +478,8 @@ static struct sysctl_test tests[] = {
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 6, 6),
/* buf[0:6] == "Linux\n\0") */
- BPF_LD_IMM64(BPF_REG_8, 0x000a78756e694cULL),
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x4c696e75780a0000ULL)),
BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
@@ -507,7 +517,8 @@ static struct sysctl_test tests[] = {
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
/* buf[0:6] == "Linux\0") */
- BPF_LD_IMM64(BPF_REG_8, 0x000078756e694cULL),
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x4c696e7578000000ULL)),
BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
@@ -650,7 +661,8 @@ static struct sysctl_test tests[] = {
/* buf[0:4] == "606\0") */
BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_7, 0),
- BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0x00363036, 2),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_9,
+ bpf_ntohl(0x36303600), 2),
/* return DENY; */
BPF_MOV64_IMM(BPF_REG_0, 0),
@@ -685,17 +697,20 @@ static struct sysctl_test tests[] = {
BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 23, 14),
/* buf[0:8] == "3000000 " && */
- BPF_LD_IMM64(BPF_REG_8, 0x2030303030303033ULL),
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x3330303030303020ULL)),
BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 10),
/* buf[8:16] == "4000000 " && */
- BPF_LD_IMM64(BPF_REG_8, 0x2030303030303034ULL),
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x3430303030303020ULL)),
BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
/* buf[16:24] == "6000000\0") */
- BPF_LD_IMM64(BPF_REG_8, 0x0030303030303036ULL),
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x3630303030303000ULL)),
BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 16),
BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
@@ -735,7 +750,8 @@ static struct sysctl_test tests[] = {
/* buf[0:3] == "60\0") */
BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_7, 0),
- BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0x003036, 2),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_9,
+ bpf_ntohl(0x36300000), 2),
/* return DENY; */
BPF_MOV64_IMM(BPF_REG_0, 0),
@@ -757,7 +773,8 @@ static struct sysctl_test tests[] = {
/* sysctl_set_new_value arg2 (buf) */
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
- BPF_MOV64_IMM(BPF_REG_0, 0x00303036),
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x36303000)),
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
@@ -791,7 +808,7 @@ static struct sysctl_test tests[] = {
/* sysctl_set_new_value arg2 (buf) */
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
- BPF_MOV64_IMM(BPF_REG_0, FIXUP_SYSCTL_VALUE),
+ BPF_LD_IMM64(BPF_REG_0, FIXUP_SYSCTL_VALUE),
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
@@ -825,8 +842,9 @@ static struct sysctl_test tests[] = {
/* arg1 (buf) */
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
- BPF_MOV64_IMM(BPF_REG_0, 0x00303036),
- BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x36303000)),
+ BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -869,7 +887,8 @@ static struct sysctl_test tests[] = {
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
/* "600 602\0" */
- BPF_LD_IMM64(BPF_REG_0, 0x0032303620303036ULL),
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3630302036303200ULL)),
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -937,7 +956,8 @@ static struct sysctl_test tests[] = {
/* arg1 (buf) */
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
- BPF_MOV64_IMM(BPF_REG_0, 0x00303036),
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x36303000)),
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -969,8 +989,9 @@ static struct sysctl_test tests[] = {
/* arg1 (buf) */
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
- BPF_MOV64_IMM(BPF_REG_0, 0x00373730),
- BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x30373700)),
+ BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -1012,7 +1033,8 @@ static struct sysctl_test tests[] = {
/* arg1 (buf) */
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
- BPF_MOV64_IMM(BPF_REG_0, 0x00303036),
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x36303000)),
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -1052,7 +1074,8 @@ static struct sysctl_test tests[] = {
/* arg1 (buf) */
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
- BPF_MOV64_IMM(BPF_REG_0, 0x090a0c0d),
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x0d0c0a09)),
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -1092,7 +1115,9 @@ static struct sysctl_test tests[] = {
/* arg1 (buf) */
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
- BPF_MOV64_IMM(BPF_REG_0, 0x00362d0a), /* " -6\0" */
+ /* " -6\0" */
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x0a2d3600)),
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -1132,8 +1157,10 @@ static struct sysctl_test tests[] = {
/* arg1 (buf) */
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
- BPF_MOV64_IMM(BPF_REG_0, 0x00362d0a), /* " -6\0" */
- BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ /* " -6\0" */
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x0a2d3600)),
+ BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -1175,8 +1202,10 @@ static struct sysctl_test tests[] = {
/* arg1 (buf) */
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
- BPF_MOV64_IMM(BPF_REG_0, 0x65667830), /* "0xfe" */
- BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ /* "0xfe" */
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x30786665)),
+ BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -1218,11 +1247,14 @@ static struct sysctl_test tests[] = {
/* arg1 (buf) 9223372036854775807 */
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
- BPF_LD_IMM64(BPF_REG_0, 0x3032373333323239ULL),
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3932323333373230ULL)),
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
- BPF_LD_IMM64(BPF_REG_0, 0x3537373435383633ULL),
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3336383534373735ULL)),
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
- BPF_LD_IMM64(BPF_REG_0, 0x0000000000373038ULL),
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3830370000000000ULL)),
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 16),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -1266,11 +1298,14 @@ static struct sysctl_test tests[] = {
/* arg1 (buf) 9223372036854775808 */
BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
- BPF_LD_IMM64(BPF_REG_0, 0x3032373333323239ULL),
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3932323333373230ULL)),
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
- BPF_LD_IMM64(BPF_REG_0, 0x3537373435383633ULL),
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3336383534373735ULL)),
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
- BPF_LD_IMM64(BPF_REG_0, 0x0000000000383038ULL),
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3830380000000000ULL)),
BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 16),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -1344,20 +1379,24 @@ static size_t probe_prog_length(const struct bpf_insn *fp)
static int fixup_sysctl_value(const char *buf, size_t buf_len,
struct bpf_insn *prog, size_t insn_num)
{
- uint32_t value_num = 0;
+ union {
+ uint8_t raw[sizeof(uint64_t)];
+ uint64_t num;
+ } value = {};
uint8_t c, i;
- if (buf_len > sizeof(value_num)) {
+ if (buf_len > sizeof(value)) {
log_err("Value is too big (%zd) to use in fixup", buf_len);
return -1;
}
-
- for (i = 0; i < buf_len; ++i) {
- c = buf[i];
- value_num |= (c << i * 8);
+ if (prog[insn_num].code != (BPF_LD | BPF_DW | BPF_IMM)) {
+ log_err("Can fixup only BPF_LD_IMM64 insns");
+ return -1;
}
- prog[insn_num].imm = value_num;
+ memcpy(value.raw, buf, buf_len);
+ prog[insn_num].imm = (uint32_t)value.num;
+ prog[insn_num + 1].imm = (uint32_t)(value.num >> 32);
return 0;
}
@@ -1499,6 +1538,7 @@ static int run_test_case(int cgfd, struct sysctl_test *test)
goto err;
}
+ errno = 0;
if (access_sysctl(sysctl_path, test) == -1) {
if (test->result == OP_EPERM && errno == EPERM)
goto out;
@@ -1507,7 +1547,7 @@ static int run_test_case(int cgfd, struct sysctl_test *test)
}
if (test->result != SUCCESS) {
- log_err("Unexpected failure");
+ log_err("Unexpected success");
goto err;
}
diff --git a/tools/testing/selftests/bpf/test_tcp_rtt.c b/tools/testing/selftests/bpf/test_tcp_rtt.c
index 90c3862f74a8..93916a69823e 100644
--- a/tools/testing/selftests/bpf/test_tcp_rtt.c
+++ b/tools/testing/selftests/bpf/test_tcp_rtt.c
@@ -6,6 +6,7 @@
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
+#include <netinet/tcp.h>
#include <pthread.h>
#include <linux/filter.h>
@@ -34,6 +35,30 @@ static void send_byte(int fd)
error(1, errno, "Failed to send single byte");
}
+static int wait_for_ack(int fd, int retries)
+{
+ struct tcp_info info;
+ socklen_t optlen;
+ int i, err;
+
+ for (i = 0; i < retries; i++) {
+ optlen = sizeof(info);
+ err = getsockopt(fd, SOL_TCP, TCP_INFO, &info, &optlen);
+ if (err < 0) {
+ log_err("Failed to lookup TCP stats");
+ return err;
+ }
+
+ if (info.tcpi_unacked == 0)
+ return 0;
+
+ usleep(10);
+ }
+
+ log_err("Did not receive ACK");
+ return -1;
+}
+
static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked,
__u32 dsack_dups, __u32 delivered, __u32 delivered_ce,
__u32 icsk_retransmits)
@@ -149,6 +174,11 @@ static int run_test(int cgroup_fd, int server_fd)
/*icsk_retransmits=*/0);
send_byte(client_fd);
+ if (wait_for_ack(client_fd, 100) < 0) {
+ err = -1;
+ goto close_client_fd;
+ }
+
err += verify_sk(map_fd, client_fd, "first payload byte",
/*invoked=*/2,
@@ -157,6 +187,7 @@ static int run_test(int cgroup_fd, int server_fd)
/*delivered_ce=*/0,
/*icsk_retransmits=*/0);
+close_client_fd:
close(client_fd);
close_bpf_object:
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 44e2d640b088..d27fd929abb9 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -61,6 +61,7 @@
#define UNPRIV_SYSCTL "kernel/unprivileged_bpf_disabled"
static bool unpriv_disabled = false;
static int skips;
+static bool verbose = false;
struct bpf_test {
const char *descr;
@@ -92,7 +93,8 @@ struct bpf_test {
enum {
UNDEF,
ACCEPT,
- REJECT
+ REJECT,
+ VERBOSE_ACCEPT,
} result, result_unpriv;
enum bpf_prog_type prog_type;
uint8_t flags;
@@ -859,6 +861,36 @@ static int do_prog_test_run(int fd_prog, bool unpriv, uint32_t expected_val,
return 0;
}
+static bool cmp_str_seq(const char *log, const char *exp)
+{
+ char needle[80];
+ const char *p, *q;
+ int len;
+
+ do {
+ p = strchr(exp, '\t');
+ if (!p)
+ p = exp + strlen(exp);
+
+ len = p - exp;
+ if (len >= sizeof(needle) || !len) {
+ printf("FAIL\nTestcase bug\n");
+ return false;
+ }
+ strncpy(needle, exp, len);
+ needle[len] = 0;
+ q = strstr(log, needle);
+ if (!q) {
+ printf("FAIL\nUnexpected verifier log in successful load!\n"
+ "EXP: %s\nRES:\n", needle);
+ return false;
+ }
+ log = q + len;
+ exp = p + 1;
+ } while (*p);
+ return true;
+}
+
static void do_test_single(struct bpf_test *test, bool unpriv,
int *passes, int *errors)
{
@@ -897,14 +929,20 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
pflags |= BPF_F_STRICT_ALIGNMENT;
if (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS)
pflags |= BPF_F_ANY_ALIGNMENT;
+ if (test->flags & ~3)
+ pflags |= test->flags;
+ expected_ret = unpriv && test->result_unpriv != UNDEF ?
+ test->result_unpriv : test->result;
+ expected_err = unpriv && test->errstr_unpriv ?
+ test->errstr_unpriv : test->errstr;
memset(&attr, 0, sizeof(attr));
attr.prog_type = prog_type;
attr.expected_attach_type = test->expected_attach_type;
attr.insns = prog;
attr.insns_cnt = prog_len;
attr.license = "GPL";
- attr.log_level = 4;
+ attr.log_level = verbose || expected_ret == VERBOSE_ACCEPT ? 1 : 4;
attr.prog_flags = pflags;
fd_prog = bpf_load_program_xattr(&attr, bpf_vlog, sizeof(bpf_vlog));
@@ -914,14 +952,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
goto close_fds;
}
- expected_ret = unpriv && test->result_unpriv != UNDEF ?
- test->result_unpriv : test->result;
- expected_err = unpriv && test->errstr_unpriv ?
- test->errstr_unpriv : test->errstr;
-
alignment_prevented_execution = 0;
- if (expected_ret == ACCEPT) {
+ if (expected_ret == ACCEPT || expected_ret == VERBOSE_ACCEPT) {
if (fd_prog < 0) {
printf("FAIL\nFailed to load prog '%s'!\n",
strerror(errno));
@@ -932,6 +965,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
(test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS))
alignment_prevented_execution = 1;
#endif
+ if (expected_ret == VERBOSE_ACCEPT && !cmp_str_seq(bpf_vlog, expected_err)) {
+ goto fail_log;
+ }
} else {
if (fd_prog >= 0) {
printf("FAIL\nUnexpected success to load!\n");
@@ -957,6 +993,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
}
}
+ if (verbose)
+ printf(", verifier log:\n%s", bpf_vlog);
+
run_errs = 0;
run_successes = 0;
if (!alignment_prevented_execution && fd_prog >= 0) {
@@ -1097,17 +1136,24 @@ int main(int argc, char **argv)
{
unsigned int from = 0, to = ARRAY_SIZE(tests);
bool unpriv = !is_admin();
+ int arg = 1;
+
+ if (argc > 1 && strcmp(argv[1], "-v") == 0) {
+ arg++;
+ verbose = true;
+ argc--;
+ }
if (argc == 3) {
- unsigned int l = atoi(argv[argc - 2]);
- unsigned int u = atoi(argv[argc - 1]);
+ unsigned int l = atoi(argv[arg]);
+ unsigned int u = atoi(argv[arg + 1]);
if (l < to && u < to) {
from = l;
to = u + 1;
}
} else if (argc == 2) {
- unsigned int t = atoi(argv[argc - 1]);
+ unsigned int t = atoi(argv[arg]);
if (t < to) {
from = t;
diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c
new file mode 100644
index 000000000000..02151f8c940f
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/precise.c
@@ -0,0 +1,194 @@
+{
+ "precise: test 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_FD(BPF_REG_6, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP, -8, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8), /* map_value_ptr -= map_value_ptr */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_9),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_2, 8, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), /* R2=inv(umin=1, umax=8) */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .fixup_map_array_48b = { 1 },
+ .result = VERBOSE_ACCEPT,
+ .errstr =
+ "26: (85) call bpf_probe_read#4\
+ last_idx 26 first_idx 20\
+ regs=4 stack=0 before 25\
+ regs=4 stack=0 before 24\
+ regs=4 stack=0 before 23\
+ regs=4 stack=0 before 22\
+ regs=4 stack=0 before 20\
+ parent didn't have regs=4 stack=0 marks\
+ last_idx 19 first_idx 10\
+ regs=4 stack=0 before 19\
+ regs=200 stack=0 before 18\
+ regs=300 stack=0 before 17\
+ regs=201 stack=0 before 15\
+ regs=201 stack=0 before 14\
+ regs=200 stack=0 before 13\
+ regs=200 stack=0 before 12\
+ regs=200 stack=0 before 11\
+ regs=200 stack=0 before 10\
+ parent already had regs=0 stack=0 marks",
+},
+{
+ "precise: test 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_FD(BPF_REG_6, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP, -8, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8), /* map_value_ptr -= map_value_ptr */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_9),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_2, 8, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), /* R2=inv(umin=1, umax=8) */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .fixup_map_array_48b = { 1 },
+ .result = VERBOSE_ACCEPT,
+ .flags = BPF_F_TEST_STATE_FREQ,
+ .errstr =
+ "26: (85) call bpf_probe_read#4\
+ last_idx 26 first_idx 22\
+ regs=4 stack=0 before 25\
+ regs=4 stack=0 before 24\
+ regs=4 stack=0 before 23\
+ regs=4 stack=0 before 22\
+ parent didn't have regs=4 stack=0 marks\
+ last_idx 20 first_idx 20\
+ regs=4 stack=0 before 20\
+ parent didn't have regs=4 stack=0 marks\
+ last_idx 19 first_idx 17\
+ regs=4 stack=0 before 19\
+ regs=200 stack=0 before 18\
+ regs=300 stack=0 before 17\
+ parent already had regs=0 stack=0 marks",
+},
+{
+ "precise: cross frame pruning",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_IMM(BPF_REG_8, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_8, 1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_IMM(BPF_REG_9, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_9, 1),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 1, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = BPF_F_TEST_STATE_FREQ,
+ .errstr = "!read_ok",
+ .result = REJECT,
+},
+{
+ "precise: ST insn causing spi > allocated_stack",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 123, 0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_3, -8, 0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_MOV64_IMM(BPF_REG_0, -1),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = BPF_F_TEST_STATE_FREQ,
+ .errstr = "5: (2d) if r4 > r0 goto pc+0\
+ last_idx 5 first_idx 5\
+ parent didn't have regs=10 stack=0 marks\
+ last_idx 4 first_idx 2\
+ regs=10 stack=0 before 4\
+ regs=10 stack=0 before 3\
+ regs=0 stack=1 before 2\
+ last_idx 5 first_idx 5\
+ parent didn't have regs=1 stack=0 marks",
+ .result = VERBOSE_ACCEPT,
+ .retval = -1,
+},
+{
+ "precise: STX insn causing spi > allocated_stack",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 123, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_MOV64_IMM(BPF_REG_0, -1),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = BPF_F_TEST_STATE_FREQ,
+ .errstr = "last_idx 6 first_idx 6\
+ parent didn't have regs=10 stack=0 marks\
+ last_idx 5 first_idx 3\
+ regs=10 stack=0 before 5\
+ regs=10 stack=0 before 4\
+ regs=0 stack=1 before 3\
+ last_idx 6 first_idx 6\
+ parent didn't have regs=1 stack=0 marks\
+ last_idx 5 first_idx 3\
+ regs=1 stack=0 before 5",
+ .result = VERBOSE_ACCEPT,
+ .retval = -1,
+},