aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/bpf/index.rst1
-rw-r--r--Documentation/bpf/prog_cgroup_sockopt.rst93
-rw-r--r--Documentation/networking/af_xdp.rst16
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_xsk.c12
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c15
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/Makefile2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en.h155
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/params.c108
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/params.h118
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c231
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h36
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/Makefile1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c192
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h27
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c223
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h25
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c111
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h15
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c267
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.h31
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c25
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c18
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c730
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_rep.c12
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_rx.c104
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_stats.c115
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_stats.h30
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c42
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c14
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/wq.h5
-rw-r--r--drivers/net/veth.c60
-rw-r--r--include/linux/bpf-cgroup.h45
-rw-r--r--include/linux/bpf.h2
-rw-r--r--include/linux/bpf_types.h1
-rw-r--r--include/linux/filter.h13
-rw-r--r--include/linux/list.h14
-rw-r--r--include/net/tcp.h8
-rw-r--r--include/net/xdp_sock.h27
-rw-r--r--include/trace/events/xdp.h34
-rw-r--r--include/uapi/linux/bpf.h33
-rw-r--r--include/uapi/linux/if_xdp.h8
-rw-r--r--kernel/bpf/cgroup.c352
-rw-r--r--kernel/bpf/core.c10
-rw-r--r--kernel/bpf/cpumap.c105
-rw-r--r--kernel/bpf/devmap.c112
-rw-r--r--kernel/bpf/syscall.c19
-rw-r--r--kernel/bpf/verifier.c136
-rw-r--r--kernel/bpf/xskmap.c3
-rw-r--r--kernel/trace/bpf_trace.c27
-rw-r--r--net/core/filter.c269
-rw-r--r--net/core/xdp.c2
-rw-r--r--net/ipv4/tcp_input.c4
-rw-r--r--net/socket.c30
-rw-r--r--net/xdp/xsk.c36
-rw-r--r--net/xdp/xsk_queue.h14
-rw-r--r--samples/bpf/Makefile3
-rwxr-xr-xsamples/bpf/do_hbm_test.sh22
-rw-r--r--samples/bpf/hbm.c18
-rw-r--r--samples/bpf/hbm_edt_kern.c168
-rw-r--r--samples/bpf/hbm_kern.h40
-rw-r--r--samples/bpf/ibumad_kern.c18
-rw-r--r--samples/bpf/tcp_bpf.readme2
-rw-r--r--samples/bpf/tcp_dumpstats_kern.c68
-rw-r--r--samples/bpf/xdp_adjust_tail_user.c12
-rw-r--r--samples/bpf/xdp_redirect_map_user.c15
-rw-r--r--samples/bpf/xdp_redirect_user.c15
-rw-r--r--samples/bpf/xdp_tx_iptunnel_user.c12
-rw-r--r--samples/bpf/xdpsock_user.c44
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-cgroup.rst7
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-prog.rst3
-rw-r--r--tools/bpf/bpftool/bash-completion/bpftool9
-rw-r--r--tools/bpf/bpftool/cgroup.c5
-rw-r--r--tools/bpf/bpftool/main.h1
-rw-r--r--tools/bpf/bpftool/prog.c3
-rw-r--r--tools/include/uapi/linux/bpf.h26
-rw-r--r--tools/include/uapi/linux/if_xdp.h8
-rw-r--r--tools/lib/bpf/libbpf.c23
-rw-r--r--tools/lib/bpf/libbpf_probes.c1
-rw-r--r--tools/lib/bpf/xsk.c15
-rw-r--r--tools/lib/bpf/xsk.h2
-rw-r--r--tools/testing/selftests/bpf/.gitignore3
-rw-r--r--tools/testing/selftests/bpf/Makefile10
-rw-r--r--tools/testing/selftests/bpf/progs/pyperf.h9
-rw-r--r--tools/testing/selftests/bpf/progs/sockopt_multi.c71
-rw-r--r--tools/testing/selftests/bpf/progs/sockopt_sk.c111
-rw-r--r--tools/testing/selftests/bpf/progs/strobemeta.h36
-rw-r--r--tools/testing/selftests/bpf/progs/tcp_rtt.c61
-rw-r--r--tools/testing/selftests/bpf/progs/test_jhash.h3
-rw-r--r--tools/testing/selftests/bpf/progs/test_seg6_loop.c23
-rw-r--r--tools/testing/selftests/bpf/progs/test_verif_scale2.c2
-rw-r--r--tools/testing/selftests/bpf/progs/xdp_redirect_map.c31
-rw-r--r--tools/testing/selftests/bpf/progs/xdp_tx.c12
-rw-r--r--tools/testing/selftests/bpf/test_section_names.c10
-rw-r--r--tools/testing/selftests/bpf/test_sockopt.c1021
-rw-r--r--tools/testing/selftests/bpf/test_sockopt_multi.c374
-rw-r--r--tools/testing/selftests/bpf/test_sockopt_sk.c211
-rw-r--r--tools/testing/selftests/bpf/test_tcp_rtt.c254
-rwxr-xr-xtools/testing/selftests/bpf/test_xdp_veth.sh118
98 files changed, 6197 insertions, 841 deletions
diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst
index d3fe4cac0c90..801a6ed3f2e5 100644
--- a/Documentation/bpf/index.rst
+++ b/Documentation/bpf/index.rst
@@ -42,6 +42,7 @@ Program types
.. toctree::
:maxdepth: 1
+ prog_cgroup_sockopt
prog_cgroup_sysctl
prog_flow_dissector
diff --git a/Documentation/bpf/prog_cgroup_sockopt.rst b/Documentation/bpf/prog_cgroup_sockopt.rst
new file mode 100644
index 000000000000..c47d974629ae
--- /dev/null
+++ b/Documentation/bpf/prog_cgroup_sockopt.rst
@@ -0,0 +1,93 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============================
+BPF_PROG_TYPE_CGROUP_SOCKOPT
+============================
+
+``BPF_PROG_TYPE_CGROUP_SOCKOPT`` program type can be attached to two
+cgroup hooks:
+
+* ``BPF_CGROUP_GETSOCKOPT`` - called every time process executes ``getsockopt``
+ system call.
+* ``BPF_CGROUP_SETSOCKOPT`` - called every time process executes ``setsockopt``
+ system call.
+
+The context (``struct bpf_sockopt``) has associated socket (``sk``) and
+all input arguments: ``level``, ``optname``, ``optval`` and ``optlen``.
+
+BPF_CGROUP_SETSOCKOPT
+=====================
+
+``BPF_CGROUP_SETSOCKOPT`` is triggered *before* the kernel handling of
+sockopt and it has writable context: it can modify the supplied arguments
+before passing them down to the kernel. This hook has access to the cgroup
+and socket local storage.
+
+If BPF program sets ``optlen`` to -1, the control will be returned
+back to the userspace after all other BPF programs in the cgroup
+chain finish (i.e. kernel ``setsockopt`` handling will *not* be executed).
+
+Note, that ``optlen`` can not be increased beyond the user-supplied
+value. It can only be decreased or set to -1. Any other value will
+trigger ``EFAULT``.
+
+Return Type
+-----------
+
+* ``0`` - reject the syscall, ``EPERM`` will be returned to the userspace.
+* ``1`` - success, continue with next BPF program in the cgroup chain.
+
+BPF_CGROUP_GETSOCKOPT
+=====================
+
+``BPF_CGROUP_GETSOCKOPT`` is triggered *after* the kernel handing of
+sockopt. The BPF hook can observe ``optval``, ``optlen`` and ``retval``
+if it's interested in whatever kernel has returned. BPF hook can override
+the values above, adjust ``optlen`` and reset ``retval`` to 0. If ``optlen``
+has been increased above initial ``getsockopt`` value (i.e. userspace
+buffer is too small), ``EFAULT`` is returned.
+
+This hook has access to the cgroup and socket local storage.
+
+Note, that the only acceptable value to set to ``retval`` is 0 and the
+original value that the kernel returned. Any other value will trigger
+``EFAULT``.
+
+Return Type
+-----------
+
+* ``0`` - reject the syscall, ``EPERM`` will be returned to the userspace.
+* ``1`` - success: copy ``optval`` and ``optlen`` to userspace, return
+ ``retval`` from the syscall (note that this can be overwritten by
+ the BPF program from the parent cgroup).
+
+Cgroup Inheritance
+==================
+
+Suppose, there is the following cgroup hierarchy where each cgroup
+has ``BPF_CGROUP_GETSOCKOPT`` attached at each level with
+``BPF_F_ALLOW_MULTI`` flag::
+
+ A (root, parent)
+ \
+ B (child)
+
+When the application calls ``getsockopt`` syscall from the cgroup B,
+the programs are executed from the bottom up: B, A. First program
+(B) sees the result of kernel's ``getsockopt``. It can optionally
+adjust ``optval``, ``optlen`` and reset ``retval`` to 0. After that
+control will be passed to the second (A) program which will see the
+same context as B including any potential modifications.
+
+Same for ``BPF_CGROUP_SETSOCKOPT``: if the program is attached to
+A and B, the trigger order is B, then A. If B does any changes
+to the input arguments (``level``, ``optname``, ``optval``, ``optlen``),
+then the next program in the chain (A) will see those changes,
+*not* the original input ``setsockopt`` arguments. The potentially
+modified values will be then passed down to the kernel.
+
+Example
+=======
+
+See ``tools/testing/selftests/bpf/progs/sockopt_sk.c`` for an example
+of BPF program that handles socket options.
diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index 50bccbf68308..eeedc2e826aa 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -220,7 +220,21 @@ Usage
In order to use AF_XDP sockets there are two parts needed. The
user-space application and the XDP program. For a complete setup and
usage example, please refer to the sample application. The user-space
-side is xdpsock_user.c and the XDP side xdpsock_kern.c.
+side is xdpsock_user.c and the XDP side is part of libbpf.
+
+The XDP code sample included in tools/lib/bpf/xsk.c is the following::
+
+ SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
+ {
+ int index = ctx->rx_queue_index;
+
+ // A set entry here means that the correspnding queue_id
+ // has an active AF_XDP socket bound to it.
+ if (bpf_map_lookup_elem(&xsks_map, &index))
+ return bpf_redirect_map(&xsks_map, index, 0);
+
+ return XDP_PASS;
+ }
Naive ring dequeue and enqueue could look like this::
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 557c565c26fc..32bad014d76c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -641,8 +641,8 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
struct i40e_tx_desc *tx_desc = NULL;
struct i40e_tx_buffer *tx_bi;
bool work_done = true;
+ struct xdp_desc desc;
dma_addr_t dma;
- u32 len;
while (budget-- > 0) {
if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
@@ -651,21 +651,23 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
break;
}
- if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &dma, &len))
+ if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
break;
- dma_sync_single_for_device(xdp_ring->dev, dma, len,
+ dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
+
+ dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
DMA_BIDIRECTIONAL);
tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
- tx_bi->bytecount = len;
+ tx_bi->bytecount = desc.len;
tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
tx_desc->buffer_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz =
build_ctob(I40E_TX_DESC_CMD_ICRC
| I40E_TX_DESC_CMD_EOP,
- 0, len, 0);
+ 0, desc.len, 0);
xdp_ring->next_to_use++;
if (xdp_ring->next_to_use == xdp_ring->count)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index 6af55bb3bef3..6b609553329f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -571,8 +571,9 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
union ixgbe_adv_tx_desc *tx_desc = NULL;
struct ixgbe_tx_buffer *tx_bi;
bool work_done = true;
- u32 len, cmd_type;
+ struct xdp_desc desc;
dma_addr_t dma;
+ u32 cmd_type;
while (budget-- > 0) {
if (unlikely(!ixgbe_desc_unused(xdp_ring)) ||
@@ -581,14 +582,16 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
break;
}
- if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &dma, &len))
+ if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
break;
- dma_sync_single_for_device(xdp_ring->dev, dma, len,
+ dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
+
+ dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
DMA_BIDIRECTIONAL);
tx_bi = &xdp_ring->tx_buffer_info[xdp_ring->next_to_use];
- tx_bi->bytecount = len;
+ tx_bi->bytecount = desc.len;
tx_bi->xdpf = NULL;
tx_bi->gso_segs = 1;
@@ -599,10 +602,10 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
cmd_type = IXGBE_ADVTXD_DTYP_DATA |
IXGBE_ADVTXD_DCMD_DEXT |
IXGBE_ADVTXD_DCMD_IFCS;
- cmd_type |= len | IXGBE_TXD_CMD;
+ cmd_type |= desc.len | IXGBE_TXD_CMD;
tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
tx_desc->read.olinfo_status =
- cpu_to_le32(len << IXGBE_ADVTXD_PAYLEN_SHIFT);
+ cpu_to_le32(desc.len << IXGBE_ADVTXD_PAYLEN_SHIFT);
xdp_ring->next_to_use++;
if (xdp_ring->next_to_use == xdp_ring->count)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 6aa0fe6de878..8456b19d79cd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -24,7 +24,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
en_selftest.o en/port.o en/monitor_stats.o en/reporter_tx.o \
- en/params.o
+ en/params.o en/xsk/umem.o en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o
#
# Netdev extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3281ace5e126..f0d77eb66acf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -137,6 +137,7 @@ struct page_pool;
#define MLX5E_MAX_NUM_CHANNELS (MLX5E_INDIR_RQT_SIZE >> 1)
#define MLX5E_MAX_NUM_SQS (MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC)
#define MLX5E_TX_CQ_POLL_BUDGET 128
+#define MLX5E_TX_XSK_POLL_BUDGET 64
#define MLX5E_SQ_RECOVER_MIN_INTERVAL 500 /* msecs */
#define MLX5E_UMR_WQE_INLINE_SZ \
@@ -155,6 +156,11 @@ do { \
##__VA_ARGS__); \
} while (0)
+enum mlx5e_rq_group {
+ MLX5E_RQ_GROUP_REGULAR,
+ MLX5E_RQ_GROUP_XSK,
+ MLX5E_NUM_RQ_GROUPS /* Keep last. */
+};
static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size)
{
@@ -179,7 +185,8 @@ static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
/* Use this function to get max num channels after netdev was created */
static inline int mlx5e_get_netdev_max_channels(struct net_device *netdev)
{
- return min_t(unsigned int, netdev->num_rx_queues,
+ return min_t(unsigned int,
+ netdev->num_rx_queues / MLX5E_NUM_RQ_GROUPS,
netdev->num_tx_queues);
}
@@ -250,6 +257,7 @@ struct mlx5e_params {
u32 lro_timeout;
u32 pflags;
struct bpf_prog *xdp_prog;
+ struct mlx5e_xsk *xsk;
unsigned int sw_mtu;
int hard_mtu;
};
@@ -348,6 +356,13 @@ enum {
struct mlx5e_sq_wqe_info {
u8 opcode;
+
+ /* Auxiliary data for different opcodes. */
+ union {
+ struct {
+ struct mlx5e_rq *rq;
+ } umr;
+ };
};
struct mlx5e_txqsq {
@@ -392,14 +407,55 @@ struct mlx5e_txqsq {
} ____cacheline_aligned_in_smp;
struct mlx5e_dma_info {
- struct page *page;
- dma_addr_t addr;
+ dma_addr_t addr;
+ union {
+ struct page *page;
+ struct {
+ u64 handle;
+ void *data;
+ } xsk;
+ };
+};
+
+/* XDP packets can be transmitted in different ways. On completion, we need to
+ * distinguish between them to clean up things in a proper way.
+ */
+enum mlx5e_xdp_xmit_mode {
+ /* An xdp_frame was transmitted due to either XDP_REDIRECT from another
+ * device or XDP_TX from an XSK RQ. The frame has to be unmapped and
+ * returned.
+ */
+ MLX5E_XDP_XMIT_MODE_FRAME,
+
+ /* The xdp_frame was created in place as a result of XDP_TX from a
+ * regular RQ. No DMA remapping happened, and the page belongs to us.
+ */
+ MLX5E_XDP_XMIT_MODE_PAGE,
+
+ /* No xdp_frame was created at all, the transmit happened from a UMEM
+ * page. The UMEM Completion Ring producer pointer has to be increased.
+ */
+ MLX5E_XDP_XMIT_MODE_XSK,
};
struct mlx5e_xdp_info {
- struct xdp_frame *xdpf;
- dma_addr_t dma_addr;
- struct mlx5e_dma_info di;
+ enum mlx5e_xdp_xmit_mode mode;
+ union {
+ struct {
+ struct xdp_frame *xdpf;
+ dma_addr_t dma_addr;
+ } frame;
+ struct {
+ struct mlx5e_rq *rq;
+ struct mlx5e_dma_info di;
+ } page;
+ };
+};
+
+struct mlx5e_xdp_xmit_data {
+ dma_addr_t dma_addr;
+ void *data;
+ u32 len;
};
struct mlx5e_xdp_info_fifo {
@@ -425,8 +481,12 @@ struct mlx5e_xdp_mpwqe {
};
struct mlx5e_xdpsq;
-typedef bool (*mlx5e_fp_xmit_xdp_frame)(struct mlx5e_xdpsq*,
- struct mlx5e_xdp_info*);
+typedef int (*mlx5e_fp_xmit_xdp_frame_check)(struct mlx5e_xdpsq *);
+typedef bool (*mlx5e_fp_xmit_xdp_frame)(struct mlx5e_xdpsq *,
+ struct mlx5e_xdp_xmit_data *,
+ struct mlx5e_xdp_info *,
+ int);
+
struct mlx5e_xdpsq {
/* data path */
@@ -443,8 +503,10 @@ struct mlx5e_xdpsq {
struct mlx5e_cq cq;
/* read only */
+ struct xdp_umem *umem;
struct mlx5_wq_cyc wq;
struct mlx5e_xdpsq_stats *stats;
+ mlx5e_fp_xmit_xdp_frame_check xmit_xdp_frame_check;
mlx5e_fp_xmit_xdp_frame xmit_xdp_frame;
struct {
struct mlx5e_xdp_wqe_info *wqe_info;
@@ -571,9 +633,11 @@ struct mlx5e_rq {
u8 log_stride_sz;
u8 umr_in_progress;
u8 umr_last_bulk;
+ u8 umr_completed;
} mpwqe;
};
struct {
+ u16 umem_headroom;
u16 headroom;
u8 map_dir; /* dma map direction */
} buff;
@@ -600,10 +664,14 @@ struct mlx5e_rq {
/* XDP */
struct bpf_prog *xdp_prog;
- struct mlx5e_xdpsq xdpsq;
+ struct mlx5e_xdpsq *xdpsq;
DECLARE_BITMAP(flags, 8);
struct page_pool *page_pool;
+ /* AF_XDP zero-copy */
+ struct zero_copy_allocator zca;
+ struct xdp_umem *umem;
+
/* control */
struct mlx5_wq_ctrl wq_ctrl;
__be32 mkey_be;
@@ -616,9 +684,15 @@ struct mlx5e_rq {
struct xdp_rxq_info xdp_rxq;
} ____cacheline_aligned_in_smp;
+enum mlx5e_channel_state {
+ MLX5E_CHANNEL_STATE_XSK,
+ MLX5E_CHANNEL_NUM_STATES
+};
+
struct mlx5e_channel {
/* data path */
struct mlx5e_rq rq;
+ struct mlx5e_xdpsq rq_xdpsq;
struct mlx5e_txqsq sq[MLX5E_MAX_NUM_TC];
struct mlx5e_icosq icosq; /* internal control operations */
bool xdp;
@@ -631,6 +705,13 @@ struct mlx5e_channel {
/* XDP_REDIRECT */
struct mlx5e_xdpsq xdpsq;
+ /* AF_XDP zero-copy */
+ struct mlx5e_rq xskrq;
+ struct mlx5e_xdpsq xsksq;
+ struct mlx5e_icosq xskicosq;
+ /* xskicosq can be accessed from any CPU - the spinlock protects it. */
+ spinlock_t xskicosq_lock;
+
/* data path - accessed per napi poll */
struct irq_desc *irq_desc;
struct mlx5e_ch_stats *stats;
@@ -639,6 +720,7 @@ struct mlx5e_channel {
struct mlx5e_priv *priv;
struct mlx5_core_dev *mdev;
struct hwtstamp_config *tstamp;
+ DECLARE_BITMAP(state, MLX5E_CHANNEL_NUM_STATES);
int ix;
int cpu;
cpumask_var_t xps_cpumask;
@@ -654,14 +736,17 @@ struct mlx5e_channel_stats {
struct mlx5e_ch_stats ch;
struct mlx5e_sq_stats sq[MLX5E_MAX_NUM_TC];
struct mlx5e_rq_stats rq;
+ struct mlx5e_rq_stats xskrq;
struct mlx5e_xdpsq_stats rq_xdpsq;
struct mlx5e_xdpsq_stats xdpsq;
+ struct mlx5e_xdpsq_stats xsksq;
} ____cacheline_aligned_in_smp;
enum {
MLX5E_STATE_OPENED,
MLX5E_STATE_DESTROYING,
MLX5E_STATE_XDP_TX_ENABLED,
+ MLX5E_STATE_XDP_OPEN,
};
struct mlx5e_rqt {
@@ -694,6 +779,17 @@ struct mlx5e_modify_sq_param {
int rl_index;
};
+struct mlx5e_xsk {
+ /* UMEMs are stored separately from channels, because we don't want to
+ * lose them when channels are recreated. The kernel also stores UMEMs,
+ * but it doesn't distinguish between zero-copy and non-zero-copy UMEMs,
+ * so rely on our mechanism.
+ */
+ struct xdp_umem **umems;
+ u16 refcnt;
+ bool ever_used;
+};
+
struct mlx5e_priv {
/* priv data path fields - start */
struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC];
@@ -714,6 +810,7 @@ struct mlx5e_priv {
struct mlx5e_tir indir_tir[MLX5E_NUM_INDIR_TIRS];
struct mlx5e_tir inner_indir_tir[MLX5E_NUM_INDIR_TIRS];
struct mlx5e_tir direct_tir[MLX5E_MAX_NUM_CHANNELS];
+ struct mlx5e_tir xsk_tir[MLX5E_MAX_NUM_CHANNELS];
struct mlx5e_rss_params rss_params;
u32 tx_rates[MLX5E_MAX_NUM_SQS];
@@ -750,6 +847,7 @@ struct mlx5e_priv {
struct mlx5e_tls *tls;
#endif
struct devlink_health_reporter *tx_reporter;
+ struct mlx5e_xsk xsk;
};
struct mlx5e_profile {
@@ -794,11 +892,13 @@ bool mlx5e_striding_rq_possible(struct mlx5_core_dev *mdev,
struct mlx5e_params *params);
void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info);
-void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
- bool recycle);
+void mlx5e_page_release_dynamic(struct mlx5e_rq *rq,
+ struct mlx5e_dma_info *dma_info,
+ bool recycle);
void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe);
bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq);
+void mlx5e_poll_ico_cq(struct mlx5e_cq *cq);
bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq);
void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix);
void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix);
@@ -854,6 +954,30 @@ void mlx5e_build_indir_tir_ctx_hash(struct mlx5e_rss_params *rss_params,
void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen);
struct mlx5e_tirc_config mlx5e_tirc_get_default_config(enum mlx5e_traffic_types tt);
+struct mlx5e_xsk_param;
+
+struct mlx5e_rq_param;
+int mlx5e_open_rq(struct mlx5e_channel *c, struct mlx5e_params *params,
+ struct mlx5e_rq_param *param, struct mlx5e_xsk_param *xsk,
+ struct xdp_umem *umem, struct mlx5e_rq *rq);
+int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time);
+void mlx5e_deactivate_rq(struct mlx5e_rq *rq);
+void mlx5e_close_rq(struct mlx5e_rq *rq);
+
+struct mlx5e_sq_param;
+int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params,
+ struct mlx5e_sq_param *param, struct mlx5e_icosq *sq);
+void mlx5e_close_icosq(struct mlx5e_icosq *sq);
+int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params,
+ struct mlx5e_sq_param *param, struct xdp_umem *umem,
+ struct mlx5e_xdpsq *sq, bool is_redirect);
+void mlx5e_close_xdpsq(struct mlx5e_xdpsq *sq);
+
+struct mlx5e_cq_param;
+int mlx5e_open_cq(struct mlx5e_channel *c, struct dim_cq_moder moder,
+ struct mlx5e_cq_param *param, struct mlx5e_cq *cq);
+void mlx5e_close_cq(struct mlx5e_cq *cq);
+
int mlx5e_open_locked(struct net_device *netdev);
int mlx5e_close_locked(struct net_device *netdev);
@@ -1024,10 +1148,10 @@ int mlx5e_create_indirect_rqt(struct mlx5e_priv *priv);
int mlx5e_create_indirect_tirs(struct mlx5e_priv *priv, bool inner_ttc);
void mlx5e_destroy_indirect_tirs(struct mlx5e_priv *priv, bool inner_ttc);
-int mlx5e_create_direct_rqts(struct mlx5e_priv *priv);
-void mlx5e_destroy_direct_rqts(struct mlx5e_priv *priv);
-int mlx5e_create_direct_tirs(struct mlx5e_priv *priv);
-void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv);
+int mlx5e_create_direct_rqts(struct mlx5e_priv *priv, struct mlx5e_tir *tirs);
+void mlx5e_destroy_direct_rqts(struct mlx5e_priv *priv, struct mlx5e_tir *tirs);
+int mlx5e_create_direct_tirs(struct mlx5e_priv *priv, struct mlx5e_tir *tirs);
+void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv, struct mlx5e_tir *tirs);
void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt);
int mlx5e_create_tis(struct mlx5_core_dev *mdev, int tc,
@@ -1097,6 +1221,7 @@ void mlx5e_detach_netdev(struct mlx5e_priv *priv);
void mlx5e_destroy_netdev(struct mlx5e_priv *priv);
void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv);
void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
+ struct mlx5e_xsk *xsk,
struct mlx5e_rss_params *rss_params,
struct mlx5e_params *params,
u16 max_channels, u16 mtu);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index d3744bffbae3..79301d116667 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -3,65 +3,102 @@
#include "en/params.h"
-u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params)
+static inline bool mlx5e_rx_is_xdp(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk)
{
- u16 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
- u16 linear_rq_headroom = params->xdp_prog ?
- XDP_PACKET_HEADROOM : MLX5_RX_HEADROOM;
- u32 frag_sz;
+ return params->xdp_prog || xsk;
+}
+
+u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk)
+{
+ u16 headroom = NET_IP_ALIGN;
+
+ if (mlx5e_rx_is_xdp(params, xsk)) {
+ headroom += XDP_PACKET_HEADROOM;
+ if (xsk)
+ headroom += xsk->headroom;
+ } else {
+ headroom += MLX5_RX_HEADROOM;
+ }
+
+ return headroom;
+}
+
+u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk)
+{
+ u32 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
+ u16 linear_rq_headroom = mlx5e_get_linear_rq_headroom(params, xsk);
+ u32 frag_sz = linear_rq_headroom + hw_mtu;
- linear_rq_headroom += NET_IP_ALIGN;
+ /* AF_XDP doesn't build SKBs in place. */
+ if (!xsk)
+ frag_sz = MLX5_SKB_FRAG_SZ(frag_sz);
- frag_sz = MLX5_SKB_FRAG_SZ(linear_rq_headroom + hw_mtu);
+ /* XDP in mlx5e doesn't support multiple packets per page. */
+ if (mlx5e_rx_is_xdp(params, xsk))
+ frag_sz = max_t(u32, frag_sz, PAGE_SIZE);
- if (params->xdp_prog && frag_sz < PAGE_SIZE)
- frag_sz = PAGE_SIZE;
+ /* Even if we can go with a smaller fragment size, we must not put
+ * multiple packets into a single frame.
+ */
+ if (xsk)
+ frag_sz = max_t(u32, frag_sz, xsk->chunk_size);
return frag_sz;
}
-u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params)
+u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk)
{
- u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params);
+ u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params, xsk);
return MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz);
}
-bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params)
+bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk)
{
- u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params);
+ /* AF_XDP allocates SKBs on XDP_PASS - ensure they don't occupy more
+ * than one page. For this, check both with and without xsk.
+ */
+ u32 linear_frag_sz = max(mlx5e_rx_get_linear_frag_sz(params, xsk),
+ mlx5e_rx_get_linear_frag_sz(params, NULL));
- return !params->lro_en && frag_sz <= PAGE_SIZE;
+ return !params->lro_en && linear_frag_sz <= PAGE_SIZE;
}
#define MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ ((BIT(__mlx5_bit_sz(wq, log_wqe_stride_size)) - 1) + \
MLX5_MPWQE_LOG_STRIDE_SZ_BASE)
bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
- struct mlx5e_params *params)
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk)
{
- u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params);
+ u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params, xsk);
s8 signed_log_num_strides_param;
u8 log_num_strides;
- if (!mlx5e_rx_is_linear_skb(params))
+ if (!mlx5e_rx_is_linear_skb(params, xsk))
return false;
- if (order_base_2(frag_sz) > MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ)
+ if (order_base_2(linear_frag_sz) > MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ)
return false;
if (MLX5_CAP_GEN(mdev, ext_stride_num_range))
return true;
- log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(frag_sz);
+ log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz);
signed_log_num_strides_param =
(s8)log_num_strides - MLX5_MPWQE_LOG_NUM_STRIDES_BASE;
return signed_log_num_strides_param >= 0;
}
-u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params)
+u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk)
{
- u8 log_pkts_per_wqe = mlx5e_mpwqe_log_pkts_per_wqe(params);
+ u8 log_pkts_per_wqe = mlx5e_mpwqe_log_pkts_per_wqe(params, xsk);
/* Numbers are unsigned, don't subtract to avoid underflow. */
if (params->log_rq_mtu_frames <
@@ -72,33 +109,30 @@ u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params)
}
u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev,
- struct mlx5e_params *params)
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk)
{
- if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
- return order_base_2(mlx5e_rx_get_linear_frag_sz(params));
+ if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params, xsk))
+ return order_base_2(mlx5e_rx_get_linear_frag_sz(params, xsk));
return MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev);
}
u8 mlx5e_mpwqe_get_log_num_strides(struct mlx5_core_dev *mdev,
- struct mlx5e_params *params)
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk)
{
return MLX5_MPWRQ_LOG_WQE_SZ -
- mlx5e_mpwqe_get_log_stride_size(mdev, params);
+ mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk);
}
u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
- struct mlx5e_params *params)
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk)
{
- u16 linear_rq_headroom = params->xdp_prog ?
- XDP_PACKET_HEADROOM : MLX5_RX_HEADROOM;
- bool is_linear_skb;
-
- linear_rq_headroom += NET_IP_ALIGN;
-
- is_linear_skb = (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC) ?
- mlx5e_rx_is_linear_skb(params) :
- mlx5e_rx_mpwqe_is_linear_skb(mdev, params);
+ bool is_linear_skb = (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC) ?
+ mlx5e_rx_is_linear_skb(params, xsk) :
+ mlx5e_rx_mpwqe_is_linear_skb(mdev, params, xsk);
- return is_linear_skb ? linear_rq_headroom : 0;
+ return is_linear_skb ? mlx5e_get_linear_rq_headroom(params, xsk) : 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
index b106a0236f36..bd882b5ee9a7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
@@ -6,17 +6,119 @@
#include "en.h"
-u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params);
-u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params);
-bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params);
+struct mlx5e_xsk_param {
+ u16 headroom;
+ u16 chunk_size;
+};
+
+struct mlx5e_rq_param {
+ u32 rqc[MLX5_ST_SZ_DW(rqc)];
+ struct mlx5_wq_param wq;
+ struct mlx5e_rq_frags_info frags_info;
+};
+
+struct mlx5e_sq_param {
+ u32 sqc[MLX5_ST_SZ_DW(sqc)];
+ struct mlx5_wq_param wq;
+ bool is_mpw;
+};
+
+struct mlx5e_cq_param {
+ u32 cqc[MLX5_ST_SZ_DW(cqc)];
+ struct mlx5_wq_param wq;
+ u16 eq_ix;
+ u8 cq_period_mode;
+};
+
+struct mlx5e_channel_param {
+ struct mlx5e_rq_param rq;
+ struct mlx5e_sq_param sq;
+ struct mlx5e_sq_param xdp_sq;
+ struct mlx5e_sq_param icosq;
+ struct mlx5e_cq_param rx_cq;
+ struct mlx5e_cq_param tx_cq;
+ struct mlx5e_cq_param icosq_cq;
+};
+
+static inline bool mlx5e_qid_get_ch_if_in_group(struct mlx5e_params *params,
+ u16 qid,
+ enum mlx5e_rq_group group,
+ u16 *ix)
+{
+ int nch = params->num_channels;
+ int ch = qid - nch * group;
+
+ if (ch < 0 || ch >= nch)
+ return false;
+
+ *ix = ch;
+ return true;
+}
+
+static inline void mlx5e_qid_get_ch_and_group(struct mlx5e_params *params,
+ u16 qid,
+ u16 *ix,
+ enum mlx5e_rq_group *group)
+{
+ u16 nch = params->num_channels;
+
+ *ix = qid % nch;
+ *group = qid / nch;
+}
+
+static inline bool mlx5e_qid_validate(struct mlx5e_params *params, u64 qid)
+{
+ return qid < params->num_channels * MLX5E_NUM_RQ_GROUPS;
+}
+
+/* Parameter calculations */
+
+u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk);
+u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk);
+u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk);
+bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk);
bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
- struct mlx5e_params *params);
-u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params);
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk);
+u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk);
u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev,
- struct mlx5e_params *params);
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk);
u8 mlx5e_mpwqe_get_log_num_strides(struct mlx5_core_dev *mdev,
- struct mlx5e_params *params);
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk);
u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
- struct mlx5e_params *params);
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk);
+
+/* Build queue parameters */
+
+void mlx5e_build_rq_param(struct mlx5e_priv *priv,
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk,
+ struct mlx5e_rq_param *param);
+void mlx5e_build_sq_param_common(struct mlx5e_priv *priv,
+ struct mlx5e_sq_param *param);
+void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk,
+ struct mlx5e_cq_param *param);
+void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
+ struct mlx5e_params *params,
+ struct mlx5e_cq_param *param);
+void mlx5e_build_ico_cq_param(struct mlx5e_priv *priv,
+ u8 log_wq_size,
+ struct mlx5e_cq_param *param);
+void mlx5e_build_icosq_param(struct mlx5e_priv *priv,
+ u8 log_wq_size,
+ struct mlx5e_sq_param *param);
+void mlx5e_build_xdpsq_param(struct mlx5e_priv *priv,
+ struct mlx5e_params *params,
+ struct mlx5e_sq_param *param);
#endif /* __MLX5_EN_PARAMS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index eb8ef78e5626..b0b982cf69bb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -31,11 +31,13 @@
*/
#include <linux/bpf_trace.h>
+#include <net/xdp_sock.h>
#include "en/xdp.h"
+#include "en/params.h"
-int mlx5e_xdp_max_mtu(struct mlx5e_params *params)
+int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk)
{
- int hr = NET_IP_ALIGN + XDP_PACKET_HEADROOM;
+ int hr = mlx5e_get_linear_rq_headroom(params, xsk);
/* Let S := SKB_DATA_ALIGN(sizeof(struct skb_shared_info)).
* The condition checked in mlx5e_rx_is_linear_skb is:
@@ -54,25 +56,70 @@ int mlx5e_xdp_max_mtu(struct mlx5e_params *params)
}
static inline bool
-mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_dma_info *di,
- struct xdp_buff *xdp)
+mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
+ struct mlx5e_dma_info *di, struct xdp_buff *xdp)
{
+ struct mlx5e_xdp_xmit_data xdptxd;
struct mlx5e_xdp_info xdpi;
+ struct xdp_frame *xdpf;
+ dma_addr_t dma_addr;
- xdpi.xdpf = convert_to_xdp_frame(xdp);
- if (unlikely(!xdpi.xdpf))
+ xdpf = convert_to_xdp_frame(xdp);
+ if (unlikely(!xdpf))
return false;
- xdpi.dma_addr = di->addr + (xdpi.xdpf->data - (void *)xdpi.xdpf);
- dma_sync_single_for_device(sq->pdev, xdpi.dma_addr,
- xdpi.xdpf->len, PCI_DMA_TODEVICE);
- xdpi.di = *di;
- return sq->xmit_xdp_frame(sq, &xdpi);
+ xdptxd.data = xdpf->data;
+ xdptxd.len = xdpf->len;
+
+ if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) {
+ /* The xdp_buff was in the UMEM and was copied into a newly
+ * allocated page. The UMEM page was returned via the ZCA, and
+ * this new page has to be mapped at this point and has to be
+ * unmapped and returned via xdp_return_frame on completion.
+ */
+
+ /* Prevent double recycling of the UMEM page. Even in case this
+ * function returns false, the xdp_buff shouldn't be recycled,
+ * as it was already done in xdp_convert_zc_to_xdp_frame.
+ */
+ __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */
+
+ xdpi.mode = MLX5E_XDP_XMIT_MODE_FRAME;
+
+ dma_addr = dma_map_single(sq->pdev, xdptxd.data, xdptxd.len,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(sq->pdev, dma_addr)) {
+ xdp_return_frame(xdpf);
+ return false;
+ }
+
+ xdptxd.dma_addr = dma_addr;
+ xdpi.frame.xdpf = xdpf;
+ xdpi.frame.dma_addr = dma_addr;
+ } else {
+ /* Driver assumes that convert_to_xdp_frame returns an xdp_frame
+ * that points to the same memory region as the original
+ * xdp_buff. It allows to map the memory only once and to use
+ * the DMA_BIDIRECTIONAL mode.
+ */
+
+ xdpi.mode = MLX5E_XDP_XMIT_MODE_PAGE;
+
+ dma_addr = di->addr + (xdpf->data - (void *)xdpf);
+ dma_sync_single_for_device(sq->pdev, dma_addr, xdptxd.len,
+ DMA_TO_DEVICE);
+
+ xdptxd.dma_addr = dma_addr;
+ xdpi.page.rq = rq;
+ xdpi.page.di = *di;
+ }
+
+ return sq->xmit_xdp_frame(sq, &xdptxd, &xdpi, 0);
}
/* returns true if packet was consumed by xdp */
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
- void *va, u16 *rx_headroom, u32 *len)
+ void *va, u16 *rx_headroom, u32 *len, bool xsk)
{
struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
struct xdp_buff xdp;
@@ -86,16 +133,20 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + *len;
xdp.data_hard_start = va;
+ if (xsk)
+ xdp.handle = di->xsk.handle;
xdp.rxq = &rq->xdp_rxq;
act = bpf_prog_run_xdp(prog, &xdp);
+ if (xsk)
+ xdp.handle += xdp.data - xdp.data_hard_start;
switch (act) {
case XDP_PASS:
*rx_headroom = xdp.data - xdp.data_hard_start;
*len = xdp.data_end - xdp.data;
return false;
case XDP_TX:
- if (unlikely(!mlx5e_xmit_xdp_buff(&rq->xdpsq, di, &xdp)))
+ if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, &xdp)))
goto xdp_abort;
__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */
return true;
@@ -106,7 +157,8 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
goto xdp_abort;
__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
__set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags);
- mlx5e_page_dma_unmap(rq, di);
+ if (!xsk)
+ mlx5e_page_dma_unmap(rq, di);
rq->stats->xdp_redirect++;
return true;
default:
@@ -160,7 +212,7 @@ static void mlx5e_xdp_mpwqe_session_start(struct mlx5e_xdpsq *sq)
stats->mpwqe++;
}
-static void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq)
+void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq)
{
struct mlx5_wq_cyc *wq = &sq->wq;
struct mlx5e_xdp_mpwqe *session = &sq->mpwqe;
@@ -183,32 +235,55 @@ static void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq)
session->wqe = NULL; /* Close session */
}
+enum {
+ MLX5E_XDP_CHECK_OK = 1,
+ MLX5E_XDP_CHECK_START_MPWQE = 2,
+};
+
+static int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq *sq)
+{
+ if (unlikely(!sq->mpwqe.wqe)) {
+ if (unlikely(!mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc,
+ MLX5_SEND_WQE_MAX_WQEBBS))) {
+ /* SQ is full, ring doorbell */
+ mlx5e_xmit_xdp_doorbell(sq);
+ sq->stats->full++;
+ return -EBUSY;
+ }
+
+ return MLX5E_XDP_CHECK_START_MPWQE;
+ }
+
+ return MLX5E_XDP_CHECK_OK;
+}
+
static bool mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq,
- struct mlx5e_xdp_info *xdpi)
+ struct mlx5e_xdp_xmit_data *xdptxd,
+ struct mlx5e_xdp_info *xdpi,
+ int check_result)
{
struct mlx5e_xdp_mpwqe *session = &sq->mpwqe;
struct mlx5e_xdpsq_stats *stats = sq->stats;
- struct xdp_frame *xdpf = xdpi->xdpf;
-
- if (unlikely(sq->hw_mtu < xdpf->len)) {
+ if (unlikely(xdptxd->len > sq->hw_mtu)) {
stats->err++;
return false;
}
- if (unlikely(!session->wqe)) {
- if (unlikely(!mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc,
- MLX5_SEND_WQE_MAX_WQEBBS))) {
- /* SQ is full, ring doorbell */
- mlx5e_xmit_xdp_doorbell(sq);
- stats->full++;
- return false;
- }
+ if (!check_result)
+ check_result = mlx5e_xmit_xdp_frame_check_mpwqe(sq);
+ if (unlikely(check_result < 0))
+ return false;
+ if (check_result == MLX5E_XDP_CHECK_START_MPWQE) {
+ /* Start the session when nothing can fail, so it's guaranteed
+ * that if there is an active session, it has at least one dseg,
+ * and it's safe to complete it at any time.
+ */
mlx5e_xdp_mpwqe_session_start(sq);
}
- mlx5e_xdp_mpwqe_add_dseg(sq, xdpi, stats);
+ mlx5e_xdp_mpwqe_add_dseg(sq, xdptxd, stats);
if (unlikely(session->complete ||
session->ds_count == session->max_ds_count))
@@ -219,7 +294,22 @@ static bool mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq,
return true;
}
-static bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi)
+static int mlx5e_xmit_xdp_frame_check(struct mlx5e_xdpsq *sq)
+{
+ if (unlikely(!mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, 1))) {
+ /* SQ is full, ring doorbell */
+ mlx5e_xmit_xdp_doorbell(sq);
+ sq->stats->full++;
+ return -EBUSY;
+ }
+
+ return MLX5E_XDP_CHECK_OK;
+}
+
+static bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq,
+ struct mlx5e_xdp_xmit_data *xdptxd,
+ struct mlx5e_xdp_info *xdpi,
+ int check_result)
{
struct mlx5_wq_cyc *wq = &sq->wq;
u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
@@ -229,9 +319,8 @@ static bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *
struct mlx5_wqe_eth_seg *eseg = &wqe->eth;
struct mlx5_wqe_data_seg *dseg = wqe->data;
- struct xdp_frame *xdpf = xdpi->xdpf;
- dma_addr_t dma_addr = xdpi->dma_addr;
- unsigned int dma_len = xdpf->len;
+ dma_addr_t dma_addr = xdptxd->dma_addr;
+ u32 dma_len = xdptxd->len;
struct mlx5e_xdpsq_stats *stats = sq->stats;
@@ -242,18 +331,16 @@ static bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *
return false;
}
- if (unlikely(!mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1))) {
- /* SQ is full, ring doorbell */
- mlx5e_xmit_xdp_doorbell(sq);
- stats->full++;
+ if (!check_result)
+ check_result = mlx5e_xmit_xdp_frame_check(sq);
+ if (unlikely(check_result < 0))
return false;
- }
cseg->fm_ce_se = 0;
/* copy the inline part if required */
if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) {
- memcpy(eseg->inline_hdr.start, xdpf->data, MLX5E_XDP_MIN_INLINE);
+ memcpy(eseg->inline_hdr.start, xdptxd->data, MLX5E_XDP_MIN_INLINE);
eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
dma_len -= MLX5E_XDP_MIN_INLINE;
dma_addr += MLX5E_XDP_MIN_INLINE;
@@ -277,7 +364,7 @@ static bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *
static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
struct mlx5e_xdp_wqe_info *wi,
- struct mlx5e_rq *rq,
+ u32 *xsk_frames,
bool recycle)
{
struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo;
@@ -286,22 +373,32 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq,
for (i = 0; i < wi->num_pkts; i++) {
struct mlx5e_xdp_info xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo);
- if (rq) {
- /* XDP_TX */
- mlx5e_page_release(rq, &xdpi.di, recycle);
- } else {
- /* XDP_REDIRECT */
- dma_unmap_single(sq->pdev, xdpi.dma_addr,
- xdpi.xdpf->len, DMA_TO_DEVICE);
- xdp_return_frame(xdpi.xdpf);
+ switch (xdpi.mode) {
+ case MLX5E_XDP_XMIT_MODE_FRAME:
+ /* XDP_TX from the XSK RQ and XDP_REDIRECT */
+ dma_unmap_single(sq->pdev, xdpi.frame.dma_addr,
+ xdpi.frame.xdpf->len, DMA_TO_DEVICE);
+ xdp_return_frame(xdpi.frame.xdpf);
+ break;
+ case MLX5E_XDP_XMIT_MODE_PAGE:
+ /* XDP_TX from the regular RQ */
+ mlx5e_page_release_dynamic(xdpi.page.rq, &xdpi.page.di, recycle);
+ break;
+ case MLX5E_XDP_XMIT_MODE_XSK:
+ /* AF_XDP send */
+ (*xsk_frames)++;
+ break;
+ default:
+ WARN_ON_ONCE(true);
}
}
}
-bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
+bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq)
{
struct mlx5e_xdpsq *sq;
struct mlx5_cqe64 *cqe;
+ u32 xsk_frames = 0;
u16 sqcc;
int i;
@@ -343,10 +440,13 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
sqcc += wi->num_wqebbs;
- mlx5e_free_xdpsq_desc(sq, wi, rq, true);
+ mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, true);
} while (!last_wqe);
} while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq)));
+ if (xsk_frames)
+ xsk_umem_complete_tx(sq->umem, xsk_frames);
+
sq->stats->cqes += i;
mlx5_cqwq_update_db_record(&cq->wq);
@@ -358,8 +458,10 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
return (i == MLX5E_TX_CQ_POLL_BUDGET);
}
-void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq)
+void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq)
{
+ u32 xsk_frames = 0;
+
while (sq->cc != sq->pc) {
struct mlx5e_xdp_wqe_info *wi;
u16 ci;
@@ -369,8 +471,11 @@ void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq)
sq->cc += wi->num_wqebbs;
- mlx5e_free_xdpsq_desc(sq, wi, rq, false);
+ mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, false);
}
+
+ if (xsk_frames)
+ xsk_umem_complete_tx(sq->umem, xsk_frames);
}
int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
@@ -398,21 +503,27 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
for (i = 0; i < n; i++) {
struct xdp_frame *xdpf = frames[i];
+ struct mlx5e_xdp_xmit_data xdptxd;
struct mlx5e_xdp_info xdpi;
- xdpi.dma_addr = dma_map_single(sq->pdev, xdpf->data, xdpf->len,
- DMA_TO_DEVICE);
- if (unlikely(dma_mapping_error(sq->pdev, xdpi.dma_addr))) {
+ xdptxd.data = xdpf->data;
+ xdptxd.len = xdpf->len;
+ xdptxd.dma_addr = dma_map_single(sq->pdev, xdptxd.data,
+ xdptxd.len, DMA_TO_DEVICE);
+
+ if (unlikely(dma_mapping_error(sq->pdev, xdptxd.dma_addr))) {
xdp_return_frame_rx_napi(xdpf);
drops++;
continue;
}
- xdpi.xdpf = xdpf;
+ xdpi.mode = MLX5E_XDP_XMIT_MODE_FRAME;
+ xdpi.frame.xdpf = xdpf;
+ xdpi.frame.dma_addr = xdptxd.dma_addr;
- if (unlikely(!sq->xmit_xdp_frame(sq, &xdpi))) {
- dma_unmap_single(sq->pdev, xdpi.dma_addr,
- xdpf->len, DMA_TO_DEVICE);
+ if (unlikely(!sq->xmit_xdp_frame(sq, &xdptxd, &xdpi, 0))) {
+ dma_unmap_single(sq->pdev, xdptxd.dma_addr,
+ xdptxd.len, DMA_TO_DEVICE);
xdp_return_frame_rx_napi(xdpf);
drops++;
}
@@ -429,7 +540,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
void mlx5e_xdp_rx_poll_complete(struct mlx5e_rq *rq)
{
- struct mlx5e_xdpsq *xdpsq = &rq->xdpsq;
+ struct mlx5e_xdpsq *xdpsq = rq->xdpsq;
if (xdpsq->mpwqe.wqe)
mlx5e_xdp_mpwqe_complete(xdpsq);
@@ -444,6 +555,8 @@ void mlx5e_xdp_rx_poll_complete(struct mlx5e_rq *rq)
void mlx5e_set_xmit_fp(struct mlx5e_xdpsq *sq, bool is_mpw)
{
+ sq->xmit_xdp_frame_check = is_mpw ?
+ mlx5e_xmit_xdp_frame_check_mpwqe : mlx5e_xmit_xdp_frame_check;
sq->xmit_xdp_frame = is_mpw ?
mlx5e_xmit_xdp_frame_mpwqe : mlx5e_xmit_xdp_frame;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
index 8b537a4b0840..2d934c8d3807 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h
@@ -39,11 +39,13 @@
(sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS)
#define MLX5E_XDP_TX_DS_COUNT (MLX5E_XDP_TX_EMPTY_DS_COUNT + 1 /* SG DS */)
-int mlx5e_xdp_max_mtu(struct mlx5e_params *params);
+struct mlx5e_xsk_param;
+int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk);
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
- void *va, u16 *rx_headroom, u32 *len);
-bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq);
-void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq);
+ void *va, u16 *rx_headroom, u32 *len, bool xsk);
+void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq);
+bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq);
+void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq);
void mlx5e_set_xmit_fp(struct mlx5e_xdpsq *sq, bool is_mpw);
void mlx5e_xdp_rx_poll_complete(struct mlx5e_rq *rq);
int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
@@ -66,6 +68,21 @@ static inline bool mlx5e_xdp_tx_is_enabled(struct mlx5e_priv *priv)
return test_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state);
}
+static inline void mlx5e_xdp_set_open(struct mlx5e_priv *priv)
+{
+ set_bit(MLX5E_STATE_XDP_OPEN, &priv->state);
+}
+
+static inline void mlx5e_xdp_set_closed(struct mlx5e_priv *priv)
+{
+ clear_bit(MLX5E_STATE_XDP_OPEN, &priv->state);
+}
+
+static inline bool mlx5e_xdp_is_open(struct mlx5e_priv *priv)
+{
+ return test_bit(MLX5E_STATE_XDP_OPEN, &priv->state);
+}
+
static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_xdpsq *sq)
{
if (sq->doorbell_cseg) {
@@ -97,15 +114,14 @@ static inline void mlx5e_xdp_update_inline_state(struct mlx5e_xdpsq *sq)
}
static inline void
-mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi,
+mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq,
+ struct mlx5e_xdp_xmit_data *xdptxd,
struct mlx5e_xdpsq_stats *stats)
{
struct mlx5e_xdp_mpwqe *session = &sq->mpwqe;
- dma_addr_t dma_addr = xdpi->dma_addr;
- struct xdp_frame *xdpf = xdpi->xdpf;
struct mlx5_wqe_data_seg *dseg =
(struct mlx5_wqe_data_seg *)session->wqe + session->ds_count;
- u16 dma_len = xdpf->len;
+ u32 dma_len = xdptxd->len;
session->pkt_count++;
@@ -124,7 +140,7 @@ mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi,
}
inline_dseg->byte_count = cpu_to_be32(dma_len | MLX5_INLINE_SEG);
- memcpy(inline_dseg->data, xdpf->data, dma_len);
+ memcpy(inline_dseg->data, xdptxd->data, dma_len);
session->ds_count += ds_cnt;
stats->inlnw++;
@@ -132,7 +148,7 @@ mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi,
}
no_inline:
- dseg->addr = cpu_to_be64(dma_addr);
+ dseg->addr = cpu_to_be64(xdptxd->dma_addr);
dseg->byte_count = cpu_to_be32(dma_len);
dseg->lkey = sq->mkey_be;
session->ds_count++;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/Makefile
new file mode 100644
index 000000000000..5ee42991900a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/Makefile
@@ -0,0 +1 @@
+subdir-ccflags-y += -I$(src)/../..
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
new file mode 100644
index 000000000000..6a55573ec8f2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#include "rx.h"
+#include "en/xdp.h"
+#include <net/xdp_sock.h>
+
+/* RX data path */
+
+bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count)
+{
+ /* Check in advance that we have enough frames, instead of allocating
+ * one-by-one, failing and moving frames to the Reuse Ring.
+ */
+ return xsk_umem_has_addrs_rq(rq->umem, count);
+}
+
+int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
+ struct mlx5e_dma_info *dma_info)
+{
+ struct xdp_umem *umem = rq->umem;
+ u64 handle;
+
+ if (!xsk_umem_peek_addr_rq(umem, &handle))
+ return -ENOMEM;
+
+ dma_info->xsk.handle = handle + rq->buff.umem_headroom;
+ dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle);
+
+ /* No need to add headroom to the DMA address. In striding RQ case, we
+ * just provide pages for UMR, and headroom is counted at the setup
+ * stage when creating a WQE. In non-striding RQ case, headroom is
+ * accounted in mlx5e_alloc_rx_wqe.
+ */
+ dma_info->addr = xdp_umem_get_dma(umem, handle);
+
+ xsk_umem_discard_addr_rq(umem);
+
+ dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+
+ return 0;
+}
+
+static inline void mlx5e_xsk_recycle_frame(struct mlx5e_rq *rq, u64 handle)
+{
+ xsk_umem_fq_reuse(rq->umem, handle & rq->umem->chunk_mask);
+}
+
+/* XSKRQ uses pages from UMEM, they must not be released. They are returned to
+ * the userspace if possible, and if not, this function is called to reuse them
+ * in the driver.
+ */
+void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
+ struct mlx5e_dma_info *dma_info)
+{
+ mlx5e_xsk_recycle_frame(rq, dma_info->xsk.handle);
+}
+
+/* Return a frame back to the hardware to fill in again. It is used by XDP when
+ * the XDP program returns XDP_TX or XDP_REDIRECT not to an XSKMAP.
+ */
+void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle)
+{
+ struct mlx5e_rq *rq = container_of(zca, struct mlx5e_rq, zca);
+
+ mlx5e_xsk_recycle_frame(rq, handle);
+}
+
+static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data,
+ u32 cqe_bcnt)
+{
+ struct sk_buff *skb;
+
+ skb = napi_alloc_skb(rq->cq.napi, cqe_bcnt);
+ if (unlikely(!skb)) {
+ rq->stats->buff_alloc_err++;
+ return NULL;
+ }
+
+ skb_put_data(skb, data, cqe_bcnt);
+
+ return skb;
+}
+
+struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
+ struct mlx5e_mpw_info *wi,
+ u16 cqe_bcnt,
+ u32 head_offset,
+ u32 page_idx)
+{
+ struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
+ u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
+ u32 cqe_bcnt32 = cqe_bcnt;
+ void *va, *data;
+ u32 frag_size;
+ bool consumed;
+
+ /* Check packet size. Note LRO doesn't use linear SKB */
+ if (unlikely(cqe_bcnt > rq->hw_mtu)) {
+ rq->stats->oversize_pkts_sw_drop++;
+ return NULL;
+ }
+
+ /* head_offset is not used in this function, because di->xsk.data and
+ * di->addr point directly to the necessary place. Furthermore, in the
+ * current implementation, one page = one packet = one frame, so
+ * head_offset should always be 0.
+ */
+ WARN_ON_ONCE(head_offset);
+
+ va = di->xsk.data;
+ data = va + rx_headroom;
+ frag_size = rq->buff.headroom + cqe_bcnt32;
+
+ dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
+ prefetch(data);
+
+ rcu_read_lock();
+ consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, true);
+ rcu_read_unlock();
+
+ /* Possible flows:
+ * - XDP_REDIRECT to XSKMAP:
+ * The page is owned by the userspace from now.
+ * - XDP_TX and other XDP_REDIRECTs:
+ * The page was returned by ZCA and recycled.
+ * - XDP_DROP:
+ * Recycle the page.
+ * - XDP_PASS:
+ * Allocate an SKB, copy the data and recycle the page.
+ *
+ * Pages to be recycled go to the Reuse Ring on MPWQE deallocation. Its
+ * size is the same as the Driver RX Ring's size, and pages for WQEs are
+ * allocated first from the Reuse Ring, so it has enough space.
+ */
+
+ if (likely(consumed)) {
+ if (likely(__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)))
+ __set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */
+ return NULL; /* page/packet was consumed by XDP */
+ }
+
+ /* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the
+ * frame. On SKB allocation failure, NULL is returned.
+ */
+ return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt32);
+}
+
+struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
+ struct mlx5_cqe64 *cqe,
+ struct mlx5e_wqe_frag_info *wi,
+ u32 cqe_bcnt)
+{
+ struct mlx5e_dma_info *di = wi->di;
+ u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
+ void *va, *data;
+ bool consumed;
+ u32 frag_size;
+
+ /* wi->offset is not used in this function, because di->xsk.data and
+ * di->addr point directly to the necessary place. Furthermore, in the
+ * current implementation, one page = one packet = one frame, so
+ * wi->offset should always be 0.
+ */
+ WARN_ON_ONCE(wi->offset);
+
+ va = di->xsk.data;
+ data = va + rx_headroom;
+ frag_size = rq->buff.headroom + cqe_bcnt;
+
+ dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
+ prefetch(data);
+
+ if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) {
+ rq->stats->wqe_err++;
+ return NULL;
+ }
+
+ rcu_read_lock();
+ consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, true);
+ rcu_read_unlock();
+
+ if (likely(consumed))
+ return NULL; /* page/packet was consumed by XDP */
+
+ /* XDP_PASS: copy the data from the UMEM to a new SKB. The frame reuse
+ * will be handled by mlx5e_put_rx_frag.
+ * On SKB allocation failure, NULL is returned.
+ */
+ return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
new file mode 100644
index 000000000000..307b923a1361
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_XSK_RX_H__
+#define __MLX5_EN_XSK_RX_H__
+
+#include "en.h"
+
+/* RX data path */
+
+bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count);
+int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
+ struct mlx5e_dma_info *dma_info);
+void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
+ struct mlx5e_dma_info *dma_info);
+void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle);
+struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
+ struct mlx5e_mpw_info *wi,
+ u16 cqe_bcnt,
+ u32 head_offset,
+ u32 page_idx);
+struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
+ struct mlx5_cqe64 *cqe,
+ struct mlx5e_wqe_frag_info *wi,
+ u32 cqe_bcnt);
+
+#endif /* __MLX5_EN_XSK_RX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
new file mode 100644
index 000000000000..aaffa6f68dc0
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#include "setup.h"
+#include "en/params.h"
+
+bool mlx5e_validate_xsk_param(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk,
+ struct mlx5_core_dev *mdev)
+{
+ /* AF_XDP doesn't support frames larger than PAGE_SIZE, and the current
+ * mlx5e XDP implementation doesn't support multiple packets per page.
+ */
+ if (xsk->chunk_size != PAGE_SIZE)
+ return false;
+
+ /* Current MTU and XSK headroom don't allow packets to fit the frames. */
+ if (mlx5e_rx_get_linear_frag_sz(params, xsk) > xsk->chunk_size)
+ return false;
+
+ /* frag_sz is different for regular and XSK RQs, so ensure that linear
+ * SKB mode is possible.
+ */
+ switch (params->rq_wq_type) {
+ case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+ return mlx5e_rx_mpwqe_is_linear_skb(mdev, params, xsk);
+ default: /* MLX5_WQ_TYPE_CYCLIC */
+ return mlx5e_rx_is_linear_skb(params, xsk);
+ }
+}
+
+static void mlx5e_build_xskicosq_param(struct mlx5e_priv *priv,
+ u8 log_wq_size,
+ struct mlx5e_sq_param *param)
+{
+ void *sqc = param->sqc;
+ void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+
+ mlx5e_build_sq_param_common(priv, param);
+
+ MLX5_SET(wq, wq, log_wq_sz, log_wq_size);
+}
+
+static void mlx5e_build_xsk_cparam(struct mlx5e_priv *priv,
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk,
+ struct mlx5e_channel_param *cparam)
+{
+ const u8 xskicosq_size = MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
+
+ mlx5e_build_rq_param(priv, params, xsk, &cparam->rq);
+ mlx5e_build_xdpsq_param(priv, params, &cparam->xdp_sq);
+ mlx5e_build_xskicosq_param(priv, xskicosq_size, &cparam->icosq);
+ mlx5e_build_rx_cq_param(priv, params, xsk, &cparam->rx_cq);
+ mlx5e_build_tx_cq_param(priv, params, &cparam->tx_cq);
+ mlx5e_build_ico_cq_param(priv, xskicosq_size, &cparam->icosq_cq);
+}
+
+int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk, struct xdp_umem *umem,
+ struct mlx5e_channel *c)
+{
+ struct mlx5e_channel_param cparam = {};
+ struct dim_cq_moder icocq_moder = {};
+ int err;
+
+ if (!mlx5e_validate_xsk_param(params, xsk, priv->mdev))
+ return -EINVAL;
+
+ mlx5e_build_xsk_cparam(priv, params, xsk, &cparam);
+
+ err = mlx5e_open_cq(c, params->rx_cq_moderation, &cparam.rx_cq, &c->xskrq.cq);
+ if (unlikely(err))
+ return err;
+
+ err = mlx5e_open_rq(c, params, &cparam.rq, xsk, umem, &c->xskrq);
+ if (unlikely(err))
+ goto err_close_rx_cq;
+
+ err = mlx5e_open_cq(c, params->tx_cq_moderation, &cparam.tx_cq, &c->xsksq.cq);
+ if (unlikely(err))
+ goto err_close_rq;
+
+ /* Create a separate SQ, so that when the UMEM is disabled, we could
+ * close this SQ safely and stop receiving CQEs. In other case, e.g., if
+ * the XDPSQ was used instead, we might run into trouble when the UMEM
+ * is disabled and then reenabled, but the SQ continues receiving CQEs
+ * from the old UMEM.
+ */
+ err = mlx5e_open_xdpsq(c, params, &cparam.xdp_sq, umem, &c->xsksq, true);
+ if (unlikely(err))
+ goto err_close_tx_cq;
+
+ err = mlx5e_open_cq(c, icocq_moder, &cparam.icosq_cq, &c->xskicosq.cq);
+ if (unlikely(err))
+ goto err_close_sq;
+
+ /* Create a dedicated SQ for posting NOPs whenever we need an IRQ to be
+ * triggered and NAPI to be called on the correct CPU.
+ */
+ err = mlx5e_open_icosq(c, params, &cparam.icosq, &c->xskicosq);
+ if (unlikely(err))
+ goto err_close_icocq;
+
+ spin_lock_init(&c->xskicosq_lock);
+
+ set_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
+
+ return 0;
+
+err_close_icocq:
+ mlx5e_close_cq(&c->xskicosq.cq);
+
+err_close_sq:
+ mlx5e_close_xdpsq(&c->xsksq);
+
+err_close_tx_cq:
+ mlx5e_close_cq(&c->xsksq.cq);
+
+err_close_rq:
+ mlx5e_close_rq(&c->xskrq);
+
+err_close_rx_cq:
+ mlx5e_close_cq(&c->xskrq.cq);
+
+ return err;
+}
+
+void mlx5e_close_xsk(struct mlx5e_channel *c)
+{
+ clear_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
+ napi_synchronize(&c->napi);
+
+ mlx5e_close_rq(&c->xskrq);
+ mlx5e_close_cq(&c->xskrq.cq);
+ mlx5e_close_icosq(&c->xskicosq);
+ mlx5e_close_cq(&c->xskicosq.cq);
+ mlx5e_close_xdpsq(&c->xsksq);
+ mlx5e_close_cq(&c->xsksq.cq);
+}
+
+void mlx5e_activate_xsk(struct mlx5e_channel *c)
+{
+ set_bit(MLX5E_RQ_STATE_ENABLED, &c->xskrq.state);
+ /* TX queue is created active. */
+ mlx5e_trigger_irq(&c->xskicosq);
+}
+
+void mlx5e_deactivate_xsk(struct mlx5e_channel *c)
+{
+ mlx5e_deactivate_rq(&c->xskrq);
+ /* TX queue is disabled on close. */
+}
+
+static int mlx5e_redirect_xsk_rqt(struct mlx5e_priv *priv, u16 ix, u32 rqn)
+{
+ struct mlx5e_redirect_rqt_param direct_rrp = {
+ .is_rss = false,
+ {
+ .rqn = rqn,
+ },
+ };
+
+ u32 rqtn = priv->xsk_tir[ix].rqt.rqtn;
+
+ return mlx5e_redirect_rqt(priv, rqtn, 1, direct_rrp);
+}
+
+int mlx5e_xsk_redirect_rqt_to_channel(struct mlx5e_priv *priv, struct mlx5e_channel *c)
+{
+ return mlx5e_redirect_xsk_rqt(priv, c->ix, c->xskrq.rqn);
+}
+
+int mlx5e_xsk_redirect_rqt_to_drop(struct mlx5e_priv *priv, u16 ix)
+{
+ return mlx5e_redirect_xsk_rqt(priv, ix, priv->drop_rq.rqn);
+}
+
+int mlx5e_xsk_redirect_rqts_to_channels(struct mlx5e_priv *priv, struct mlx5e_channels *chs)
+{
+ int err, i;
+
+ if (!priv->xsk.refcnt)
+ return 0;
+
+ for (i = 0; i < chs->num; i++) {
+ struct mlx5e_channel *c = chs->c[i];
+
+ if (!test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
+ continue;
+
+ err = mlx5e_xsk_redirect_rqt_to_channel(priv, c);
+ if (unlikely(err))
+ goto err_stop;
+ }
+
+ return 0;
+
+err_stop:
+ for (i--; i >= 0; i--) {
+ if (!test_bit(MLX5E_CHANNEL_STATE_XSK, chs->c[i]->state))
+ continue;
+
+ mlx5e_xsk_redirect_rqt_to_drop(priv, i);
+ }
+
+ return err;
+}
+
+void mlx5e_xsk_redirect_rqts_to_drop(struct mlx5e_priv *priv, struct mlx5e_channels *chs)
+{
+ int i;
+
+ if (!priv->xsk.refcnt)
+ return;
+
+ for (i = 0; i < chs->num; i++) {
+ if (!test_bit(MLX5E_CHANNEL_STATE_XSK, chs->c[i]->state))
+ continue;
+
+ mlx5e_xsk_redirect_rqt_to_drop(priv, i);
+ }
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h
new file mode 100644
index 000000000000..0dd11b81c046
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_XSK_SETUP_H__
+#define __MLX5_EN_XSK_SETUP_H__
+
+#include "en.h"
+
+struct mlx5e_xsk_param;
+
+bool mlx5e_validate_xsk_param(struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk,
+ struct mlx5_core_dev *mdev);
+int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk, struct xdp_umem *umem,
+ struct mlx5e_channel *c);
+void mlx5e_close_xsk(struct mlx5e_channel *c);
+void mlx5e_activate_xsk(struct mlx5e_channel *c);
+void mlx5e_deactivate_xsk(struct mlx5e_channel *c);
+int mlx5e_xsk_redirect_rqt_to_channel(struct mlx5e_priv *priv, struct mlx5e_channel *c);
+int mlx5e_xsk_redirect_rqt_to_drop(struct mlx5e_priv *priv, u16 ix);
+int mlx5e_xsk_redirect_rqts_to_channels(struct mlx5e_priv *priv, struct mlx5e_channels *chs);
+void mlx5e_xsk_redirect_rqts_to_drop(struct mlx5e_priv *priv, struct mlx5e_channels *chs);
+
+#endif /* __MLX5_EN_XSK_SETUP_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
new file mode 100644
index 000000000000..35e188cf4ea4
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#include "tx.h"
+#include "umem.h"
+#include "en/xdp.h"
+#include "en/params.h"
+#include <net/xdp_sock.h>
+
+int mlx5e_xsk_async_xmit(struct net_device *dev, u32 qid)
+{
+ struct mlx5e_priv *priv = netdev_priv(dev);
+ struct mlx5e_params *params = &priv->channels.params;
+ struct mlx5e_channel *c;
+ u16 ix;
+
+ if (unlikely(!mlx5e_xdp_is_open(priv)))
+ return -ENETDOWN;
+
+ if (unlikely(!mlx5e_qid_get_ch_if_in_group(params, qid, MLX5E_RQ_GROUP_XSK, &ix)))
+ return -EINVAL;
+
+ c = priv->channels.c[ix];
+
+ if (unlikely(!test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)))
+ return -ENXIO;
+
+ if (!napi_if_scheduled_mark_missed(&c->napi)) {
+ spin_lock(&c->xskicosq_lock);
+ mlx5e_trigger_irq(&c->xskicosq);
+ spin_unlock(&c->xskicosq_lock);
+ }
+
+ return 0;
+}
+
+/* When TX fails (because of the size of the packet), we need to get completions
+ * in order, so post a NOP to get a CQE. Since AF_XDP doesn't distinguish
+ * between successful TX and errors, handling in mlx5e_poll_xdpsq_cq is the
+ * same.
+ */
+static void mlx5e_xsk_tx_post_err(struct mlx5e_xdpsq *sq,
+ struct mlx5e_xdp_info *xdpi)
+{
+ u16 pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc);
+ struct mlx5e_xdp_wqe_info *wi = &sq->db.wqe_info[pi];
+ struct mlx5e_tx_wqe *nopwqe;
+
+ wi->num_wqebbs = 1;
+ wi->num_pkts = 1;
+
+ nopwqe = mlx5e_post_nop(&sq->wq, sq->sqn, &sq->pc);
+ mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi);
+ sq->doorbell_cseg = &nopwqe->ctrl;
+}
+
+bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
+{
+ struct xdp_umem *umem = sq->umem;
+ struct mlx5e_xdp_info xdpi;
+ struct mlx5e_xdp_xmit_data xdptxd;
+ bool work_done = true;
+ bool flush = false;
+
+ xdpi.mode = MLX5E_XDP_XMIT_MODE_XSK;
+
+ for (; budget; budget--) {
+ int check_result = sq->xmit_xdp_frame_check(sq);
+ struct xdp_desc desc;
+
+ if (unlikely(check_result < 0)) {
+ work_done = false;
+ break;
+ }
+
+ if (!xsk_umem_consume_tx(umem, &desc)) {
+ /* TX will get stuck until something wakes it up by
+ * triggering NAPI. Currently it's expected that the
+ * application calls sendto() if there are consumed, but
+ * not completed frames.
+ */
+ break;
+ }
+
+ xdptxd.dma_addr = xdp_umem_get_dma(umem, desc.addr);
+ xdptxd.data = xdp_umem_get_data(umem, desc.addr);
+ xdptxd.len = desc.len;
+
+ dma_sync_single_for_device(sq->pdev, xdptxd.dma_addr,
+ xdptxd.len, DMA_BIDIRECTIONAL);
+
+ if (unlikely(!sq->xmit_xdp_frame(sq, &xdptxd, &xdpi, check_result))) {
+ if (sq->mpwqe.wqe)
+ mlx5e_xdp_mpwqe_complete(sq);
+
+ mlx5e_xsk_tx_post_err(sq, &xdpi);
+ }
+
+ flush = true;
+ }
+
+ if (flush) {
+ if (sq->mpwqe.wqe)
+ mlx5e_xdp_mpwqe_complete(sq);
+ mlx5e_xmit_xdp_doorbell(sq);
+
+ xsk_umem_consume_tx_done(umem);
+ }
+
+ return !(budget && work_done);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
new file mode 100644
index 000000000000..7add18bf78d8
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_XSK_TX_H__
+#define __MLX5_EN_XSK_TX_H__
+
+#include "en.h"
+
+/* TX data path */
+
+int mlx5e_xsk_async_xmit(struct net_device *dev, u32 qid);
+
+bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget);
+
+#endif /* __MLX5_EN_XSK_TX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c
new file mode 100644
index 000000000000..4baaa5788320
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.c
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#include <net/xdp_sock.h>
+#include "umem.h"
+#include "setup.h"
+#include "en/params.h"
+
+static int mlx5e_xsk_map_umem(struct mlx5e_priv *priv,
+ struct xdp_umem *umem)
+{
+ struct device *dev = priv->mdev->device;
+ u32 i;
+
+ for (i = 0; i < umem->npgs; i++) {
+ dma_addr_t dma = dma_map_page(dev, umem->pgs[i], 0, PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+
+ if (unlikely(dma_mapping_error(dev, dma)))
+ goto err_unmap;
+ umem->pages[i].dma = dma;
+ }
+
+ return 0;
+
+err_unmap:
+ while (i--) {
+ dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+ umem->pages[i].dma = 0;
+ }
+
+ return -ENOMEM;
+}
+
+static void mlx5e_xsk_unmap_umem(struct mlx5e_priv *priv,
+ struct xdp_umem *umem)
+{
+ struct device *dev = priv->mdev->device;
+ u32 i;
+
+ for (i = 0; i < umem->npgs; i++) {
+ dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+ umem->pages[i].dma = 0;
+ }
+}
+
+static int mlx5e_xsk_get_umems(struct mlx5e_xsk *xsk)
+{
+ if (!xsk->umems) {
+ xsk->umems = kcalloc(MLX5E_MAX_NUM_CHANNELS,
+ sizeof(*xsk->umems), GFP_KERNEL);
+ if (unlikely(!xsk->umems))
+ return -ENOMEM;
+ }
+
+ xsk->refcnt++;
+ xsk->ever_used = true;
+
+ return 0;
+}
+
+static void mlx5e_xsk_put_umems(struct mlx5e_xsk *xsk)
+{
+ if (!--xsk->refcnt) {
+ kfree(xsk->umems);
+ xsk->umems = NULL;
+ }
+}
+
+static int mlx5e_xsk_add_umem(struct mlx5e_xsk *xsk, struct xdp_umem *umem, u16 ix)
+{
+ int err;
+
+ err = mlx5e_xsk_get_umems(xsk);
+ if (unlikely(err))
+ return err;
+
+ xsk->umems[ix] = umem;
+ return 0;
+}
+
+static void mlx5e_xsk_remove_umem(struct mlx5e_xsk *xsk, u16 ix)
+{
+ xsk->umems[ix] = NULL;
+
+ mlx5e_xsk_put_umems(xsk);
+}
+
+static bool mlx5e_xsk_is_umem_sane(struct xdp_umem *umem)
+{
+ return umem->headroom <= 0xffff && umem->chunk_size_nohr <= 0xffff;
+}
+
+void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk)
+{
+ xsk->headroom = umem->headroom;
+ xsk->chunk_size = umem->chunk_size_nohr + umem->headroom;
+}
+
+static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv,
+ struct xdp_umem *umem, u16 ix)
+{
+ struct mlx5e_params *params = &priv->channels.params;
+ struct mlx5e_xsk_param xsk;
+ struct mlx5e_channel *c;
+ int err;
+
+ if (unlikely(mlx5e_xsk_get_umem(&priv->channels.params, &priv->xsk, ix)))
+ return -EBUSY;
+
+ if (unlikely(!mlx5e_xsk_is_umem_sane(umem)))
+ return -EINVAL;
+
+ err = mlx5e_xsk_map_umem(priv, umem);
+ if (unlikely(err))
+ return err;
+
+ err = mlx5e_xsk_add_umem(&priv->xsk, umem, ix);
+ if (unlikely(err))
+ goto err_unmap_umem;
+
+ mlx5e_build_xsk_param(umem, &xsk);
+
+ if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+ /* XSK objects will be created on open. */
+ goto validate_closed;
+ }
+
+ if (!params->xdp_prog) {
+ /* XSK objects will be created when an XDP program is set,
+ * and the channels are reopened.
+ */
+ goto validate_closed;
+ }
+
+ c = priv->channels.c[ix];
+
+ err = mlx5e_open_xsk(priv, params, &xsk, umem, c);
+ if (unlikely(err))
+ goto err_remove_umem;
+
+ mlx5e_activate_xsk(c);
+
+ /* Don't wait for WQEs, because the newer xdpsock sample doesn't provide
+ * any Fill Ring entries at the setup stage.
+ */
+
+ err = mlx5e_xsk_redirect_rqt_to_channel(priv, priv->channels.c[ix]);
+ if (unlikely(err))
+ goto err_deactivate;
+
+ return 0;
+
+err_deactivate:
+ mlx5e_deactivate_xsk(c);
+ mlx5e_close_xsk(c);
+
+err_remove_umem:
+ mlx5e_xsk_remove_umem(&priv->xsk, ix);
+
+err_unmap_umem:
+ mlx5e_xsk_unmap_umem(priv, umem);
+
+ return err;
+
+validate_closed:
+ /* Check the configuration in advance, rather than fail at a later stage
+ * (in mlx5e_xdp_set or on open) and end up with no channels.
+ */
+ if (!mlx5e_validate_xsk_param(params, &xsk, priv->mdev)) {
+ err = -EINVAL;
+ goto err_remove_umem;
+ }
+
+ return 0;
+}
+
+static int mlx5e_xsk_disable_locked(struct mlx5e_priv *priv, u16 ix)
+{
+ struct xdp_umem *umem = mlx5e_xsk_get_umem(&priv->channels.params,
+ &priv->xsk, ix);
+ struct mlx5e_channel *c;
+
+ if (unlikely(!umem))
+ return -EINVAL;
+
+ if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+ goto remove_umem;
+
+ /* XSK RQ and SQ are only created if XDP program is set. */
+ if (!priv->channels.params.xdp_prog)
+ goto remove_umem;
+
+ c = priv->channels.c[ix];
+ mlx5e_xsk_redirect_rqt_to_drop(priv, ix);
+ mlx5e_deactivate_xsk(c);
+ mlx5e_close_xsk(c);
+
+remove_umem:
+ mlx5e_xsk_remove_umem(&priv->xsk, ix);
+ mlx5e_xsk_unmap_umem(priv, umem);
+
+ return 0;
+}
+
+static int mlx5e_xsk_enable_umem(struct mlx5e_priv *priv, struct xdp_umem *umem,
+ u16 ix)
+{
+ int err;
+
+ mutex_lock(&priv->state_lock);
+ err = mlx5e_xsk_enable_locked(priv, umem, ix);
+ mutex_unlock(&priv->state_lock);
+
+ return err;
+}
+
+static int mlx5e_xsk_disable_umem(struct mlx5e_priv *priv, u16 ix)
+{
+ int err;
+
+ mutex_lock(&priv->state_lock);
+ err = mlx5e_xsk_disable_locked(priv, ix);
+ mutex_unlock(&priv->state_lock);
+
+ return err;
+}
+
+int mlx5e_xsk_setup_umem(struct net_device *dev, struct xdp_umem *umem, u16 qid)
+{
+ struct mlx5e_priv *priv = netdev_priv(dev);
+ struct mlx5e_params *params = &priv->channels.params;
+ u16 ix;
+
+ if (unlikely(!mlx5e_qid_get_ch_if_in_group(params, qid, MLX5E_RQ_GROUP_XSK, &ix)))
+ return -EINVAL;
+
+ return umem ? mlx5e_xsk_enable_umem(priv, umem, ix) :
+ mlx5e_xsk_disable_umem(priv, ix);
+}
+
+int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries)
+{
+ struct xdp_umem_fq_reuse *reuseq;
+
+ reuseq = xsk_reuseq_prepare(nentries);
+ if (unlikely(!reuseq))
+ return -ENOMEM;
+ xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
+
+ return 0;
+}
+
+u16 mlx5e_xsk_first_unused_channel(struct mlx5e_params *params, struct mlx5e_xsk *xsk)
+{
+ u16 res = xsk->refcnt ? params->num_channels : 0;
+
+ while (res) {
+ if (mlx5e_xsk_get_umem(params, xsk, res - 1))
+ break;
+ --res;
+ }
+
+ return res;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.h
new file mode 100644
index 000000000000..25b4cbe58b54
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/umem.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_XSK_UMEM_H__
+#define __MLX5_EN_XSK_UMEM_H__
+
+#include "en.h"
+
+static inline struct xdp_umem *mlx5e_xsk_get_umem(struct mlx5e_params *params,
+ struct mlx5e_xsk *xsk, u16 ix)
+{
+ if (!xsk || !xsk->umems)
+ return NULL;
+
+ if (unlikely(ix >= params->num_channels))
+ return NULL;
+
+ return xsk->umems[ix];
+}
+
+struct mlx5e_xsk_param;
+void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk);
+
+/* .ndo_bpf callback. */
+int mlx5e_xsk_setup_umem(struct net_device *dev, struct xdp_umem *umem, u16 qid);
+
+int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries);
+
+u16 mlx5e_xsk_first_unused_channel(struct mlx5e_params *params, struct mlx5e_xsk *xsk);
+
+#endif /* __MLX5_EN_XSK_UMEM_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 198a52d1e515..126ec4181286 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -32,6 +32,7 @@
#include "en.h"
#include "en/port.h"
+#include "en/xsk/umem.h"
#include "lib/clock.h"
void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv,
@@ -388,8 +389,17 @@ static int mlx5e_set_ringparam(struct net_device *dev,
void mlx5e_ethtool_get_channels(struct mlx5e_priv *priv,
struct ethtool_channels *ch)
{
+ mutex_lock(&priv->state_lock);
+
ch->max_combined = mlx5e_get_netdev_max_channels(priv->netdev);
ch->combined_count = priv->channels.params.num_channels;
+ if (priv->xsk.refcnt) {
+ /* The upper half are XSK queues. */
+ ch->max_combined *= 2;
+ ch->combined_count *= 2;
+ }
+
+ mutex_unlock(&priv->state_lock);
}
static void mlx5e_get_channels(struct net_device *dev,
@@ -403,6 +413,7 @@ static void mlx5e_get_channels(struct net_device *dev,
int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv,
struct ethtool_channels *ch)
{
+ struct mlx5e_params *cur_params = &priv->channels.params;
unsigned int count = ch->combined_count;
struct mlx5e_channels new_channels = {};
bool arfs_enabled;
@@ -414,16 +425,26 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv,
return -EINVAL;
}
- if (priv->channels.params.num_channels == count)
+ if (cur_params->num_channels == count)
return 0;
mutex_lock(&priv->state_lock);
+ /* Don't allow changing the number of channels if there is an active
+ * XSK, because the numeration of the XSK and regular RQs will change.
+ */
+ if (priv->xsk.refcnt) {
+ err = -EINVAL;
+ netdev_err(priv->netdev, "%s: AF_XDP is active, cannot change the number of channels\n",
+ __func__);
+ goto out;
+ }
+
new_channels.params = priv->channels.params;
new_channels.params.num_channels = count;
if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
- priv->channels.params = new_channels.params;
+ *cur_params = new_channels.params;
if (!netif_is_rxfh_configured(priv->netdev))
mlx5e_build_default_indir_rqt(priv->rss_params.indirection_rqt,
MLX5E_INDIR_RQT_SIZE, count);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index 839662644ed3..ea3a490b569a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -32,6 +32,8 @@
#include <linux/mlx5/fs.h>
#include "en.h"
+#include "en/params.h"
+#include "en/xsk/umem.h"
struct mlx5e_ethtool_rule {
struct list_head list;
@@ -414,6 +416,14 @@ add_ethtool_flow_rule(struct mlx5e_priv *priv,
if (fs->ring_cookie == RX_CLS_FLOW_DISC) {
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
} else {
+ struct mlx5e_params *params = &priv->channels.params;
+ enum mlx5e_rq_group group;
+ struct mlx5e_tir *tir;
+ u16 ix;
+
+ mlx5e_qid_get_ch_and_group(params, fs->ring_cookie, &ix, &group);
+ tir = group == MLX5E_RQ_GROUP_XSK ? priv->xsk_tir : priv->direct_tir;
+
dst = kzalloc(sizeof(*dst), GFP_KERNEL);
if (!dst) {
err = -ENOMEM;
@@ -421,7 +431,7 @@ add_ethtool_flow_rule(struct mlx5e_priv *priv,
}
dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
- dst->tir_num = priv->direct_tir[fs->ring_cookie].tirn;
+ dst->tir_num = tir[ix].tirn;
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
}
@@ -600,9 +610,9 @@ static int validate_flow(struct mlx5e_priv *priv,
if (fs->location >= MAX_NUM_OF_ETHTOOL_RULES)
return -ENOSPC;
- if (fs->ring_cookie >= priv->channels.params.num_channels &&
- fs->ring_cookie != RX_CLS_FLOW_DISC)
- return -EINVAL;
+ if (fs->ring_cookie != RX_CLS_FLOW_DISC)
+ if (!mlx5e_qid_validate(&priv->channels.params, fs->ring_cookie))
+ return -EINVAL;
switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
case ETHER_FLOW:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 1085040675ae..2f9093ba82aa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -38,6 +38,7 @@
#include <linux/bpf.h>
#include <linux/if_bridge.h>
#include <net/page_pool.h>
+#include <net/xdp_sock.h>
#include "eswitch.h"
#include "en.h"
#include "en_tc.h"
@@ -56,35 +57,10 @@
#include "en/monitor_stats.h"
#include "en/reporter.h"
#include "en/params.h"
-
-struct mlx5e_rq_param {
- u32 rqc[MLX5_ST_SZ_DW(rqc)];
- struct mlx5_wq_param wq;
- struct mlx5e_rq_frags_info frags_info;
-};
-
-struct mlx5e_sq_param {
- u32 sqc[MLX5_ST_SZ_DW(sqc)];
- struct mlx5_wq_param wq;
- bool is_mpw;
-};
-
-struct mlx5e_cq_param {
- u32 cqc[MLX5_ST_SZ_DW(cqc)];
- struct mlx5_wq_param wq;
- u16 eq_ix;
- u8 cq_period_mode;
-};
-
-struct mlx5e_channel_param {
- struct mlx5e_rq_param rq;
- struct mlx5e_sq_param sq;
- struct mlx5e_sq_param xdp_sq;
- struct mlx5e_sq_param icosq;
- struct mlx5e_cq_param rx_cq;
- struct mlx5e_cq_param tx_cq;
- struct mlx5e_cq_param icosq_cq;
-};
+#include "en/xsk/umem.h"
+#include "en/xsk/setup.h"
+#include "en/xsk/rx.h"
+#include "en/xsk/tx.h"
bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
{
@@ -114,18 +90,31 @@ void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
mlx5_core_info(mdev, "MLX5E: StrdRq(%d) RqSz(%ld) StrdSz(%ld) RxCqeCmprss(%d)\n",
params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ,
params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ ?
- BIT(mlx5e_mpwqe_get_log_rq_size(params)) :
+ BIT(mlx5e_mpwqe_get_log_rq_size(params, NULL)) :
BIT(params->log_rq_mtu_frames),
- BIT(mlx5e_mpwqe_get_log_stride_size(mdev, params)),
+ BIT(mlx5e_mpwqe_get_log_stride_size(mdev, params, NULL)),
MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
}
bool mlx5e_striding_rq_possible(struct mlx5_core_dev *mdev,
struct mlx5e_params *params)
{
- return mlx5e_check_fragmented_striding_rq_cap(mdev) &&
- !MLX5_IPSEC_DEV(mdev) &&
- !(params->xdp_prog && !mlx5e_rx_mpwqe_is_linear_skb(mdev, params));
+ if (!mlx5e_check_fragmented_striding_rq_cap(mdev))
+ return false;
+
+ if (MLX5_IPSEC_DEV(mdev))
+ return false;
+
+ if (params->xdp_prog) {
+ /* XSK params are not considered here. If striding RQ is in use,
+ * and an XSK is being opened, mlx5e_rx_mpwqe_is_linear_skb will
+ * be called with the known XSK params.
+ */
+ if (!mlx5e_rx_mpwqe_is_linear_skb(mdev, params, NULL))
+ return false;
+ }
+
+ return true;
}
void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params)
@@ -394,6 +383,8 @@ static void mlx5e_free_di_list(struct mlx5e_rq *rq)
static int mlx5e_alloc_rq(struct mlx5e_channel *c,
struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk,
+ struct xdp_umem *umem,
struct mlx5e_rq_param *rqp,
struct mlx5e_rq *rq)
{
@@ -401,6 +392,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
struct mlx5_core_dev *mdev = c->mdev;
void *rqc = rqp->rqc;
void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
+ u32 num_xsk_frames = 0;
+ u32 rq_xdp_ix;
u32 pool_size;
int wq_sz;
int err;
@@ -417,7 +410,13 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
rq->ix = c->ix;
rq->mdev = mdev;
rq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
- rq->stats = &c->priv->channel_stats[c->ix].rq;
+ rq->xdpsq = &c->rq_xdpsq;
+ rq->umem = umem;
+
+ if (rq->umem)
+ rq->stats = &c->priv->channel_stats[c->ix].xskrq;
+ else
+ rq->stats = &c->priv->channel_stats[c->ix].rq;
rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL;
if (IS_ERR(rq->xdp_prog)) {
@@ -426,12 +425,16 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
goto err_rq_wq_destroy;
}
- err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix);
+ rq_xdp_ix = rq->ix;
+ if (xsk)
+ rq_xdp_ix += params->num_channels * MLX5E_RQ_GROUP_XSK;
+ err = xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq_xdp_ix);
if (err < 0)
goto err_rq_wq_destroy;
rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
- rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params);
+ rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, xsk);
+ rq->buff.umem_headroom = xsk ? xsk->headroom : 0;
pool_size = 1 << params->log_rq_mtu_frames;
switch (rq->wq_type) {
@@ -445,7 +448,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
- pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params);
+ if (xsk)
+ num_xsk_frames = wq_sz <<
+ mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk);
+
+ pool_size = MLX5_MPWRQ_PAGES_PER_WQE <<
+ mlx5e_mpwqe_get_log_rq_size(params, xsk);
rq->post_wqes = mlx5e_post_rx_mpwqes;
rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
@@ -464,12 +472,15 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
goto err_rq_wq_destroy;
}
- rq->mpwqe.skb_from_cqe_mpwrq =
- mlx5e_rx_mpwqe_is_linear_skb(mdev, params) ?
- mlx5e_skb_from_cqe_mpwrq_linear :
- mlx5e_skb_from_cqe_mpwrq_nonlinear;
- rq->mpwqe.log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params);
- rq->mpwqe.num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params));
+ rq->mpwqe.skb_from_cqe_mpwrq = xsk ?
+ mlx5e_xsk_skb_from_cqe_mpwrq_linear :
+ mlx5e_rx_mpwqe_is_linear_skb(mdev, params, NULL) ?
+ mlx5e_skb_from_cqe_mpwrq_linear :
+ mlx5e_skb_from_cqe_mpwrq_nonlinear;
+
+ rq->mpwqe.log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk);
+ rq->mpwqe.num_strides =
+ BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk));
err = mlx5e_create_rq_umr_mkey(mdev, rq);
if (err)
@@ -490,6 +501,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq);
+ if (xsk)
+ num_xsk_frames = wq_sz << rq->wqe.info.log_num_frags;
+
rq->wqe.info = rqp->frags_info;
rq->wqe.frags =
kvzalloc_node(array_size(sizeof(*rq->wqe.frags),
@@ -503,6 +517,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
err = mlx5e_init_di_list(rq, wq_sz, c->cpu);
if (err)
goto err_free;
+
rq->post_wqes = mlx5e_post_rx_wqes;
rq->dealloc_wqe = mlx5e_dealloc_rx_wqe;
@@ -518,37 +533,53 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
goto err_free;
}
- rq->wqe.skb_from_cqe = mlx5e_rx_is_linear_skb(params) ?
- mlx5e_skb_from_cqe_linear :
- mlx5e_skb_from_cqe_nonlinear;
+ rq->wqe.skb_from_cqe = xsk ?
+ mlx5e_xsk_skb_from_cqe_linear :
+ mlx5e_rx_is_linear_skb(params, NULL) ?
+ mlx5e_skb_from_cqe_linear :
+ mlx5e_skb_from_cqe_nonlinear;
rq->mkey_be = c->mkey_be;
}
- /* Create a page_pool and register it with rxq */
- pp_params.order = 0;
- pp_params.flags = 0; /* No-internal DMA mapping in page_pool */
- pp_params.pool_size = pool_size;
- pp_params.nid = cpu_to_node(c->cpu);
- pp_params.dev = c->pdev;
- pp_params.dma_dir = rq->buff.map_dir;
-
- /* page_pool can be used even when there is no rq->xdp_prog,
- * given page_pool does not handle DMA mapping there is no
- * required state to clear. And page_pool gracefully handle
- * elevated refcnt.
- */
- rq->page_pool = page_pool_create(&pp_params);
- if (IS_ERR(rq->page_pool)) {
- err = PTR_ERR(rq->page_pool);
- rq->page_pool = NULL;
- goto err_free;
+ if (xsk) {
+ err = mlx5e_xsk_resize_reuseq(umem, num_xsk_frames);
+ if (unlikely(err)) {
+ mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n",
+ num_xsk_frames);
+ goto err_free;
+ }
+
+ rq->zca.free = mlx5e_xsk_zca_free;
+ err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+ MEM_TYPE_ZERO_COPY,
+ &rq->zca);
+ } else {
+ /* Create a page_pool and register it with rxq */
+ pp_params.order = 0;
+ pp_params.flags = 0; /* No-internal DMA mapping in page_pool */
+ pp_params.pool_size = pool_size;
+ pp_params.nid = cpu_to_node(c->cpu);
+ pp_params.dev = c->pdev;
+ pp_params.dma_dir = rq->buff.map_dir;
+
+ /* page_pool can be used even when there is no rq->xdp_prog,
+ * given page_pool does not handle DMA mapping there is no
+ * required state to clear. And page_pool gracefully handle
+ * elevated refcnt.
+ */
+ rq->page_pool = page_pool_create(&pp_params);
+ if (IS_ERR(rq->page_pool)) {
+ err = PTR_ERR(rq->page_pool);
+ rq->page_pool = NULL;
+ goto err_free;
+ }
+ err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+ MEM_TYPE_PAGE_POOL, rq->page_pool);
+ if (err)
+ page_pool_free(rq->page_pool);
}
- err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
- MEM_TYPE_PAGE_POOL, rq->page_pool);
- if (err) {
- page_pool_free(rq->page_pool);
+ if (err)
goto err_free;
- }
for (i = 0; i < wq_sz; i++) {
if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
@@ -639,7 +670,11 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
i = (i + 1) & (MLX5E_CACHE_SIZE - 1)) {
struct mlx5e_dma_info *dma_info = &rq->page_cache.page_cache[i];
- mlx5e_page_release(rq, dma_info, false);
+ /* With AF_XDP, page_cache is not used, so this loop is not
+ * entered, and it's safe to call mlx5e_page_release_dynamic
+ * directly.
+ */
+ mlx5e_page_release_dynamic(rq, dma_info, false);
}
xdp_rxq_info_unreg(&rq->xdp_rxq);
@@ -776,7 +811,7 @@ static void mlx5e_destroy_rq(struct mlx5e_rq *rq)
mlx5_core_destroy_rq(rq->mdev, rq->rqn);
}
-static int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time)
+int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time)
{
unsigned long exp_time = jiffies + msecs_to_jiffies(wait_time);
struct mlx5e_channel *c = rq->channel;
@@ -834,14 +869,13 @@ static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
}
-static int mlx5e_open_rq(struct mlx5e_channel *c,
- struct mlx5e_params *params,
- struct mlx5e_rq_param *param,
- struct mlx5e_rq *rq)
+int mlx5e_open_rq(struct mlx5e_channel *c, struct mlx5e_params *params,
+ struct mlx5e_rq_param *param, struct mlx5e_xsk_param *xsk,
+ struct xdp_umem *umem, struct mlx5e_rq *rq)
{
int err;
- err = mlx5e_alloc_rq(c, params, param, rq);
+ err = mlx5e_alloc_rq(c, params, xsk, umem, param, rq);
if (err)
return err;
@@ -879,13 +913,13 @@ static void mlx5e_activate_rq(struct mlx5e_rq *rq)
mlx5e_trigger_irq(&rq->channel->icosq);
}
-static void mlx5e_deactivate_rq(struct mlx5e_rq *rq)
+void mlx5e_deactivate_rq(struct mlx5e_rq *rq)
{
clear_bit(MLX5E_RQ_STATE_ENABLED, &rq->state);
napi_synchronize(&rq->channel->napi); /* prevent mlx5e_post_rx_wqes */
}
-static void mlx5e_close_rq(struct mlx5e_rq *rq)
+void mlx5e_close_rq(struct mlx5e_rq *rq)
{
cancel_work_sync(&rq->dim.work);
mlx5e_destroy_rq(rq);
@@ -938,6 +972,7 @@ static int mlx5e_alloc_xdpsq_db(struct mlx5e_xdpsq *sq, int numa)
static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
struct mlx5e_params *params,
+ struct xdp_umem *umem,
struct mlx5e_sq_param *param,
struct mlx5e_xdpsq *sq,
bool is_redirect)
@@ -953,9 +988,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
sq->uar_map = mdev->mlx5e_res.bfreg.map;
sq->min_inline_mode = params->tx_min_inline_mode;
sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
- sq->stats = is_redirect ?
- &c->priv->channel_stats[c->ix].xdpsq :
- &c->priv->channel_stats[c->ix].rq_xdpsq;
+ sq->umem = umem;
+
+ sq->stats = sq->umem ?
+ &c->priv->channel_stats[c->ix].xsksq :
+ is_redirect ?
+ &c->priv->channel_stats[c->ix].xdpsq :
+ &c->priv->channel_stats[c->ix].rq_xdpsq;
param->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
@@ -1335,10 +1374,8 @@ static void mlx5e_tx_err_cqe_work(struct work_struct *recover_work)
mlx5e_tx_reporter_err_cqe(sq);
}
-static int mlx5e_open_icosq(struct mlx5e_channel *c,
- struct mlx5e_params *params,
- struct mlx5e_sq_param *param,
- struct mlx5e_icosq *sq)
+int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params,
+ struct mlx5e_sq_param *param, struct mlx5e_icosq *sq)
{
struct mlx5e_create_sq_param csp = {};
int err;
@@ -1364,7 +1401,7 @@ err_free_icosq:
return err;
}
-static void mlx5e_close_icosq(struct mlx5e_icosq *sq)
+void mlx5e_close_icosq(struct mlx5e_icosq *sq)
{
struct mlx5e_channel *c = sq->channel;
@@ -1375,16 +1412,14 @@ static void mlx5e_close_icosq(struct mlx5e_icosq *sq)
mlx5e_free_icosq(sq);
}
-static int mlx5e_open_xdpsq(struct mlx5e_channel *c,
- struct mlx5e_params *params,
- struct mlx5e_sq_param *param,
- struct mlx5e_xdpsq *sq,
- bool is_redirect)
+int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params,
+ struct mlx5e_sq_param *param, struct xdp_umem *umem,
+ struct mlx5e_xdpsq *sq, bool is_redirect)
{
struct mlx5e_create_sq_param csp = {};
int err;
- err = mlx5e_alloc_xdpsq(c, params, param, sq, is_redirect);
+ err = mlx5e_alloc_xdpsq(c, params, umem, param, sq, is_redirect);
if (err)
return err;
@@ -1438,7 +1473,7 @@ err_free_xdpsq:
return err;
}
-static void mlx5e_close_xdpsq(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq)
+void mlx5e_close_xdpsq(struct mlx5e_xdpsq *sq)
{
struct mlx5e_channel *c = sq->channel;
@@ -1446,7 +1481,7 @@ static void mlx5e_close_xdpsq(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq)
napi_synchronize(&c->napi);
mlx5e_destroy_sq(c->mdev, sq->sqn);
- mlx5e_free_xdpsq_descs(sq, rq);
+ mlx5e_free_xdpsq_descs(sq);
mlx5e_free_xdpsq(sq);
}
@@ -1567,10 +1602,8 @@ static void mlx5e_destroy_cq(struct mlx5e_cq *cq)
mlx5_core_destroy_cq(cq->mdev, &cq->mcq);
}
-static int mlx5e_open_cq(struct mlx5e_channel *c,
- struct dim_cq_moder moder,
- struct mlx5e_cq_param *param,
- struct mlx5e_cq *cq)
+int mlx5e_open_cq(struct mlx5e_channel *c, struct dim_cq_moder moder,
+ struct mlx5e_cq_param *param, struct mlx5e_cq *cq)
{
struct mlx5_core_dev *mdev = c->mdev;
int err;
@@ -1593,7 +1626,7 @@ err_free_cq:
return err;
}
-static void mlx5e_close_cq(struct mlx5e_cq *cq)
+void mlx5e_close_cq(struct mlx5e_cq *cq)
{
mlx5e_destroy_cq(cq);
mlx5e_free_cq(cq);
@@ -1767,49 +1800,16 @@ static void mlx5e_free_xps_cpumask(struct mlx5e_channel *c)
free_cpumask_var(c->xps_cpumask);
}
-static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
- struct mlx5e_params *params,
- struct mlx5e_channel_param *cparam,
- struct mlx5e_channel **cp)
+static int mlx5e_open_queues(struct mlx5e_channel *c,
+ struct mlx5e_params *params,
+ struct mlx5e_channel_param *cparam)
{
- int cpu = cpumask_first(mlx5_comp_irq_get_affinity_mask(priv->mdev, ix));
struct dim_cq_moder icocq_moder = {0, 0};
- struct net_device *netdev = priv->netdev;
- struct mlx5e_channel *c;
- unsigned int irq;
int err;
- int eqn;
-
- err = mlx5_vector2eqn(priv->mdev, ix, &eqn, &irq);
- if (err)
- return err;
-
- c = kvzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
- if (!c)
- return -ENOMEM;
-
- c->priv = priv;
- c->mdev = priv->mdev;
- c->tstamp = &priv->tstamp;
- c->ix = ix;
- c->cpu = cpu;
- c->pdev = priv->mdev->device;
- c->netdev = priv->netdev;
- c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
- c->num_tc = params->num_tc;
- c->xdp = !!params->xdp_prog;
- c->stats = &priv->channel_stats[ix].ch;
- c->irq_desc = irq_to_desc(irq);
-
- err = mlx5e_alloc_xps_cpumask(c, params);
- if (err)
- goto err_free_channel;
-
- netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);
err = mlx5e_open_cq(c, icocq_moder, &cparam->icosq_cq, &c->icosq.cq);
if (err)
- goto err_napi_del;
+ return err;
err = mlx5e_open_tx_cqs(c, params, cparam);
if (err)
@@ -1825,7 +1825,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
/* XDP SQ CQ params are same as normal TXQ sq CQ params */
err = c->xdp ? mlx5e_open_cq(c, params->tx_cq_moderation,
- &cparam->tx_cq, &c->rq.xdpsq.cq) : 0;
+ &cparam->tx_cq, &c->rq_xdpsq.cq) : 0;
if (err)
goto err_close_rx_cq;
@@ -1839,20 +1839,21 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
if (err)
goto err_close_icosq;
- err = c->xdp ? mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, &c->rq.xdpsq, false) : 0;
- if (err)
- goto err_close_sqs;
+ if (c->xdp) {
+ err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, NULL,
+ &c->rq_xdpsq, false);
+ if (err)
+ goto err_close_sqs;
+ }
- err = mlx5e_open_rq(c, params, &cparam->rq, &c->rq);
+ err = mlx5e_open_rq(c, params, &cparam->rq, NULL, NULL, &c->rq);
if (err)
goto err_close_xdp_sq;
- err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, &c->xdpsq, true);
+ err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, NULL, &c->xdpsq, true);
if (err)
goto err_close_rq;
- *cp = c;
-
return 0;
err_close_rq:
@@ -1860,7 +1861,7 @@ err_close_rq:
err_close_xdp_sq:
if (c->xdp)
- mlx5e_close_xdpsq(&c->rq.xdpsq, &c->rq);
+ mlx5e_close_xdpsq(&c->rq_xdpsq);
err_close_sqs:
mlx5e_close_sqs(c);
@@ -1870,8 +1871,9 @@ err_close_icosq:
err_disable_napi:
napi_disable(&c->napi);
+
if (c->xdp)
- mlx5e_close_cq(&c->rq.xdpsq.cq);
+ mlx5e_close_cq(&c->rq_xdpsq.cq);
err_close_rx_cq:
mlx5e_close_cq(&c->rq.cq);
@@ -1885,6 +1887,85 @@ err_close_tx_cqs:
err_close_icosq_cq:
mlx5e_close_cq(&c->icosq.cq);
+ return err;
+}
+
+static void mlx5e_close_queues(struct mlx5e_channel *c)
+{
+ mlx5e_close_xdpsq(&c->xdpsq);
+ mlx5e_close_rq(&c->rq);
+ if (c->xdp)
+ mlx5e_close_xdpsq(&c->rq_xdpsq);
+ mlx5e_close_sqs(c);
+ mlx5e_close_icosq(&c->icosq);
+ napi_disable(&c->napi);
+ if (c->xdp)
+ mlx5e_close_cq(&c->rq_xdpsq.cq);
+ mlx5e_close_cq(&c->rq.cq);
+ mlx5e_close_cq(&c->xdpsq.cq);
+ mlx5e_close_tx_cqs(c);
+ mlx5e_close_cq(&c->icosq.cq);
+}
+
+static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
+ struct mlx5e_params *params,
+ struct mlx5e_channel_param *cparam,
+ struct xdp_umem *umem,
+ struct mlx5e_channel **cp)
+{
+ int cpu = cpumask_first(mlx5_comp_irq_get_affinity_mask(priv->mdev, ix));
+ struct net_device *netdev = priv->netdev;
+ struct mlx5e_xsk_param xsk;
+ struct mlx5e_channel *c;
+ unsigned int irq;
+ int err;
+ int eqn;
+
+ err = mlx5_vector2eqn(priv->mdev, ix, &eqn, &irq);
+ if (err)
+ return err;
+
+ c = kvzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
+ if (!c)
+ return -ENOMEM;
+
+ c->priv = priv;
+ c->mdev = priv->mdev;
+ c->tstamp = &priv->tstamp;
+ c->ix = ix;
+ c->cpu = cpu;
+ c->pdev = priv->mdev->device;
+ c->netdev = priv->netdev;
+ c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
+ c->num_tc = params->num_tc;
+ c->xdp = !!params->xdp_prog;
+ c->stats = &priv->channel_stats[ix].ch;
+ c->irq_desc = irq_to_desc(irq);
+
+ err = mlx5e_alloc_xps_cpumask(c, params);
+ if (err)
+ goto err_free_channel;
+
+ netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);
+
+ err = mlx5e_open_queues(c, params, cparam);
+ if (unlikely(err))
+ goto err_napi_del;
+
+ if (umem) {
+ mlx5e_build_xsk_param(umem, &xsk);
+ err = mlx5e_open_xsk(priv, params, &xsk, umem, c);
+ if (unlikely(err))
+ goto err_close_queues;
+ }
+
+ *cp = c;
+
+ return 0;
+
+err_close_queues:
+ mlx5e_close_queues(c);
+
err_napi_del:
netif_napi_del(&c->napi);
mlx5e_free_xps_cpumask(c);
@@ -1903,12 +1984,18 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c)
mlx5e_activate_txqsq(&c->sq[tc]);
mlx5e_activate_rq(&c->rq);
netif_set_xps_queue(c->netdev, c->xps_cpumask, c->ix);
+
+ if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
+ mlx5e_activate_xsk(c);
}
static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
{
int tc;
+ if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
+ mlx5e_deactivate_xsk(c);
+
mlx5e_deactivate_rq(&c->rq);
for (tc = 0; tc < c->num_tc; tc++)
mlx5e_deactivate_txqsq(&c->sq[tc]);
@@ -1916,19 +2003,9 @@ static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
static void mlx5e_close_channel(struct mlx5e_channel *c)
{
- mlx5e_close_xdpsq(&c->xdpsq, NULL);
- mlx5e_close_rq(&c->rq);
- if (c->xdp)
- mlx5e_close_xdpsq(&c->rq.xdpsq, &c->rq);
- mlx5e_close_sqs(c);
- mlx5e_close_icosq(&c->icosq);
- napi_disable(&c->napi);
- if (c->xdp)
- mlx5e_close_cq(&c->rq.xdpsq.cq);
- mlx5e_close_cq(&c->rq.cq);
- mlx5e_close_cq(&c->xdpsq.cq);
- mlx5e_close_tx_cqs(c);
- mlx5e_close_cq(&c->icosq.cq);
+ if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
+ mlx5e_close_xsk(c);
+ mlx5e_close_queues(c);
netif_napi_del(&c->napi);
mlx5e_free_xps_cpumask(c);
@@ -1939,6 +2016,7 @@ static void mlx5e_close_channel(struct mlx5e_channel *c)
static void mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk,
struct mlx5e_rq_frags_info *info)
{
u32 byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu);
@@ -1951,10 +2029,10 @@ static void mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
byte_count += MLX5E_METADATA_ETHER_LEN;
#endif
- if (mlx5e_rx_is_linear_skb(params)) {
+ if (mlx5e_rx_is_linear_skb(params, xsk)) {
int frag_stride;
- frag_stride = mlx5e_rx_get_linear_frag_sz(params);
+ frag_stride = mlx5e_rx_get_linear_frag_sz(params, xsk);
frag_stride = roundup_pow_of_two(frag_stride);
info->arr[0].frag_size = byte_count;
@@ -2012,9 +2090,10 @@ static u8 mlx5e_get_rq_log_wq_sz(void *rqc)
return MLX5_GET(wq, wq, log_wq_sz);
}
-static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
- struct mlx5e_params *params,
- struct mlx5e_rq_param *param)
+void mlx5e_build_rq_param(struct mlx5e_priv *priv,
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk,
+ struct mlx5e_rq_param *param)
{
struct mlx5_core_dev *mdev = priv->mdev;
void *rqc = param->rqc;
@@ -2024,16 +2103,16 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
switch (params->rq_wq_type) {
case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
MLX5_SET(wq, wq, log_wqe_num_of_strides,
- mlx5e_mpwqe_get_log_num_strides(mdev, params) -
+ mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk) -
MLX5_MPWQE_LOG_NUM_STRIDES_BASE);
MLX5_SET(wq, wq, log_wqe_stride_size,
- mlx5e_mpwqe_get_log_stride_size(mdev, params) -
+ mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk) -
MLX5_MPWQE_LOG_STRIDE_SZ_BASE);
- MLX5_SET(wq, wq, log_wq_sz, mlx5e_mpwqe_get_log_rq_size(params));
+ MLX5_SET(wq, wq, log_wq_sz, mlx5e_mpwqe_get_log_rq_size(params, xsk));
break;
default: /* MLX5_WQ_TYPE_CYCLIC */
MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames);
- mlx5e_build_rq_frags_info(mdev, params, &param->frags_info);
+ mlx5e_build_rq_frags_info(mdev, params, xsk, &param->frags_info);
ndsegs = param->frags_info.num_frags;
}
@@ -2064,8 +2143,8 @@ static void mlx5e_build_drop_rq_param(struct mlx5e_priv *priv,
param->wq.buf_numa_node = dev_to_node(mdev->device);
}
-static void mlx5e_build_sq_param_common(struct mlx5e_priv *priv,
- struct mlx5e_sq_param *param)
+void mlx5e_build_sq_param_common(struct mlx5e_priv *priv,
+ struct mlx5e_sq_param *param)
{
void *sqc = param->sqc;
void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
@@ -2101,9 +2180,10 @@ static void mlx5e_build_common_cq_param(struct mlx5e_priv *priv,
MLX5_SET(cqc, cqc, cqe_sz, CQE_STRIDE_128_PAD);
}
-static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
- struct mlx5e_params *params,
- struct mlx5e_cq_param *param)
+void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
+ struct mlx5e_params *params,
+ struct mlx5e_xsk_param *xsk,
+ struct mlx5e_cq_param *param)
{
struct mlx5_core_dev *mdev = priv->mdev;
void *cqc = param->cqc;
@@ -2111,8 +2191,8 @@ static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
switch (params->rq_wq_type) {
case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
- log_cq_size = mlx5e_mpwqe_get_log_rq_size(params) +
- mlx5e_mpwqe_get_log_num_strides(mdev, params);
+ log_cq_size = mlx5e_mpwqe_get_log_rq_size(params, xsk) +
+ mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk);
break;
default: /* MLX5_WQ_TYPE_CYCLIC */
log_cq_size = params->log_rq_mtu_frames;
@@ -2128,9 +2208,9 @@ static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
param->cq_period_mode = params->rx_cq_moderation.cq_period_mode;
}
-static void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
- struct mlx5e_params *params,
- struct mlx5e_cq_param *param)
+void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
+ struct mlx5e_params *params,
+ struct mlx5e_cq_param *param)
{
void *cqc = param->cqc;
@@ -2140,9 +2220,9 @@ static void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
param->cq_period_mode = params->tx_cq_moderation.cq_period_mode;
}
-static void mlx5e_build_ico_cq_param(struct mlx5e_priv *priv,
- u8 log_wq_size,
- struct mlx5e_cq_param *param)
+void mlx5e_build_ico_cq_param(struct mlx5e_priv *priv,
+ u8 log_wq_size,
+ struct mlx5e_cq_param *param)
{
void *cqc = param->cqc;
@@ -2153,9 +2233,9 @@ static void mlx5e_build_ico_cq_param(struct mlx5e_priv *priv,
param->cq_period_mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
}
-static void mlx5e_build_icosq_param(struct mlx5e_priv *priv,
- u8 log_wq_size,
- struct mlx5e_sq_param *param)
+void mlx5e_build_icosq_param(struct mlx5e_priv *priv,
+ u8 log_wq_size,
+ struct mlx5e_sq_param *param)
{
void *sqc = param->sqc;
void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
@@ -2166,9 +2246,9 @@ static void mlx5e_build_icosq_param(struct mlx5e_priv *priv,
MLX5_SET(sqc, sqc, reg_umr, MLX5_CAP_ETH(priv->mdev, reg_umr_sq));
}
-static void mlx5e_build_xdpsq_param(struct mlx5e_priv *priv,
- struct mlx5e_params *params,
- struct mlx5e_sq_param *param)
+void mlx5e_build_xdpsq_param(struct mlx5e_priv *priv,
+ struct mlx5e_params *params,
+ struct mlx5e_sq_param *param)
{
void *sqc = param->sqc;
void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
@@ -2196,14 +2276,14 @@ static void mlx5e_build_channel_param(struct mlx5e_priv *priv,
{
u8 icosq_log_wq_sz;
- mlx5e_build_rq_param(priv, params, &cparam->rq);
+ mlx5e_build_rq_param(priv, params, NULL, &cparam->rq);
icosq_log_wq_sz = mlx5e_build_icosq_log_wq_sz(params, &cparam->rq);
mlx5e_build_sq_param(priv, params, &cparam->sq);
mlx5e_build_xdpsq_param(priv, params, &cparam->xdp_sq);
mlx5e_build_icosq_param(priv, icosq_log_wq_sz, &cparam->icosq);
- mlx5e_build_rx_cq_param(priv, params, &cparam->rx_cq);
+ mlx5e_build_rx_cq_param(priv, params, NULL, &cparam->rx_cq);
mlx5e_build_tx_cq_param(priv, params, &cparam->tx_cq);
mlx5e_build_ico_cq_param(priv, icosq_log_wq_sz, &cparam->icosq_cq);
}
@@ -2224,7 +2304,12 @@ int mlx5e_open_channels(struct mlx5e_priv *priv,
mlx5e_build_channel_param(priv, &chs->params, cparam);
for (i = 0; i < chs->num; i++) {
- err = mlx5e_open_channel(priv, i, &chs->params, cparam, &chs->c[i]);
+ struct xdp_umem *umem = NULL;
+
+ if (chs->params.xdp_prog)
+ umem = mlx5e_xsk_get_umem(&chs->params, chs->params.xsk, i);
+
+ err = mlx5e_open_channel(priv, i, &chs->params, cparam, umem, &chs->c[i]);
if (err)
goto err_close_channels;
}
@@ -2266,6 +2351,10 @@ static int mlx5e_wait_channels_min_rx_wqes(struct mlx5e_channels *chs)
int timeout = err ? 0 : MLX5E_RQ_WQES_TIMEOUT;
err |= mlx5e_wait_for_min_rx_wqes(&chs->c[i]->rq, timeout);
+
+ /* Don't wait on the XSK RQ, because the newer xdpsock sample
+ * doesn't provide any Fill Ring entries at the setup stage.
+ */
}
return err ? -ETIMEDOUT : 0;
@@ -2338,35 +2427,35 @@ int mlx5e_create_indirect_rqt(struct mlx5e_priv *priv)
return err;
}
-int mlx5e_create_direct_rqts(struct mlx5e_priv *priv)
+int mlx5e_create_direct_rqts(struct mlx5e_priv *priv, struct mlx5e_tir *tirs)
{
- struct mlx5e_rqt *rqt;
+ const int max_nch = mlx5e_get_netdev_max_channels(priv->netdev);
int err;
int ix;
- for (ix = 0; ix < mlx5e_get_netdev_max_channels(priv->netdev); ix++) {
- rqt = &priv->direct_tir[ix].rqt;
- err = mlx5e_create_rqt(priv, 1 /*size */, rqt);
- if (err)
+ for (ix = 0; ix < max_nch; ix++) {
+ err = mlx5e_create_rqt(priv, 1 /*size */, &tirs[ix].rqt);
+ if (unlikely(err))
goto err_destroy_rqts;
}
return 0;
err_destroy_rqts:
- mlx5_core_warn(priv->mdev, "create direct rqts failed, %d\n", err);
+ mlx5_core_warn(priv->mdev, "create rqts failed, %d\n", err);
for (ix--; ix >= 0; ix--)
- mlx5e_destroy_rqt(priv, &priv->direct_tir[ix].rqt);
+ mlx5e_destroy_rqt(priv, &tirs[ix].rqt);
return err;
}
-void mlx5e_destroy_direct_rqts(struct mlx5e_priv *priv)
+void mlx5e_destroy_direct_rqts(struct mlx5e_priv *priv, struct mlx5e_tir *tirs)
{
+ const int max_nch = mlx5e_get_netdev_max_channels(priv->netdev);
int i;
- for (i = 0; i < mlx5e_get_netdev_max_channels(priv->netdev); i++)
- mlx5e_destroy_rqt(priv, &priv->direct_tir[i].rqt);
+ for (i = 0; i < max_nch; i++)
+ mlx5e_destroy_rqt(priv, &tirs[i].rqt);
}
static int mlx5e_rx_hash_fn(int hfunc)
@@ -2786,11 +2875,12 @@ static void mlx5e_build_tx2sq_maps(struct mlx5e_priv *priv)
void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
{
int num_txqs = priv->channels.num * priv->channels.params.num_tc;
+ int num_rxqs = priv->channels.num * MLX5E_NUM_RQ_GROUPS;
struct net_device *netdev = priv->netdev;
mlx5e_netdev_set_tcs(netdev);
netif_set_real_num_tx_queues(netdev, num_txqs);
- netif_set_real_num_rx_queues(netdev, priv->channels.num);
+ netif_set_real_num_rx_queues(netdev, num_rxqs);
mlx5e_build_tx2sq_maps(priv);
mlx5e_activate_channels(&priv->channels);
@@ -2802,10 +2892,14 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
mlx5e_wait_channels_min_rx_wqes(&priv->channels);
mlx5e_redirect_rqts_to_channels(priv, &priv->channels);
+
+ mlx5e_xsk_redirect_rqts_to_channels(priv, &priv->channels);
}
void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv)
{
+ mlx5e_xsk_redirect_rqts_to_drop(priv, &priv->channels);
+
mlx5e_redirect_rqts_to_drop(priv);
if (mlx5e_is_vport_rep(priv))
@@ -2884,9 +2978,12 @@ void mlx5e_timestamp_init(struct mlx5e_priv *priv)
int mlx5e_open_locked(struct net_device *netdev)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
+ bool is_xdp = priv->channels.params.xdp_prog;
int err;
set_bit(MLX5E_STATE_OPENED, &priv->state);
+ if (is_xdp)
+ mlx5e_xdp_set_open(priv);
err = mlx5e_open_channels(priv, &priv->channels);
if (err)
@@ -2901,6 +2998,8 @@ int mlx5e_open_locked(struct net_device *netdev)
return 0;
err_clear_state_opened_flag:
+ if (is_xdp)
+ mlx5e_xdp_set_closed(priv);
clear_bit(MLX5E_STATE_OPENED, &priv->state);
return err;
}
@@ -2932,6 +3031,8 @@ int mlx5e_close_locked(struct net_device *netdev)
if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
return 0;
+ if (priv->channels.params.xdp_prog)
+ mlx5e_xdp_set_closed(priv);
clear_bit(MLX5E_STATE_OPENED, &priv->state);
netif_carrier_off(priv->netdev);
@@ -3188,13 +3289,13 @@ err_destroy_inner_tirs:
return err;
}
-int mlx5e_create_direct_tirs(struct mlx5e_priv *priv)
+int mlx5e_create_direct_tirs(struct mlx5e_priv *priv, struct mlx5e_tir *tirs)
{
- int nch = mlx5e_get_netdev_max_channels(priv->netdev);
+ const int max_nch = mlx5e_get_netdev_max_channels(priv->netdev);
struct mlx5e_tir *tir;
void *tirc;
int inlen;
- int err;
+ int err = 0;
u32 *in;
int ix;
@@ -3203,25 +3304,24 @@ int mlx5e_create_direct_tirs(struct mlx5e_priv *priv)
if (!in)
return -ENOMEM;
- for (ix = 0; ix < nch; ix++) {
+ for (ix = 0; ix < max_nch; ix++) {
memset(in, 0, inlen);
- tir = &priv->direct_tir[ix];
+ tir = &tirs[ix];
tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
- mlx5e_build_direct_tir_ctx(priv, priv->direct_tir[ix].rqt.rqtn, tirc);
+ mlx5e_build_direct_tir_ctx(priv, tir->rqt.rqtn, tirc);
err = mlx5e_create_tir(priv->mdev, tir, in, inlen);
- if (err)
+ if (unlikely(err))
goto err_destroy_ch_tirs;
}
- kvfree(in);
-
- return 0;
+ goto out;
err_destroy_ch_tirs:
- mlx5_core_warn(priv->mdev, "create direct tirs failed, %d\n", err);
+ mlx5_core_warn(priv->mdev, "create tirs failed, %d\n", err);
for (ix--; ix >= 0; ix--)
- mlx5e_destroy_tir(priv->mdev, &priv->direct_tir[ix]);
+ mlx5e_destroy_tir(priv->mdev, &tirs[ix]);
+out:
kvfree(in);
return err;
@@ -3241,13 +3341,13 @@ void mlx5e_destroy_indirect_tirs(struct mlx5e_priv *priv, bool inner_ttc)
mlx5e_destroy_tir(priv->mdev, &priv->inner_indir_tir[i]);
}
-void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv)
+void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv, struct mlx5e_tir *tirs)
{
- int nch = mlx5e_get_netdev_max_channels(priv->netdev);
+ const int max_nch = mlx5e_get_netdev_max_channels(priv->netdev);
int i;
- for (i = 0; i < nch; i++)
- mlx5e_destroy_tir(priv->mdev, &priv->direct_tir[i]);
+ for (i = 0; i < max_nch; i++)
+ mlx5e_destroy_tir(priv->mdev, &tirs[i]);
}
static int mlx5e_modify_channels_scatter_fcs(struct mlx5e_channels *chs, bool enable)
@@ -3389,11 +3489,12 @@ void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s)
for (i = 0; i < mlx5e_get_netdev_max_channels(priv->netdev); i++) {
struct mlx5e_channel_stats *channel_stats = &priv->channel_stats[i];
+ struct mlx5e_rq_stats *xskrq_stats = &channel_stats->xskrq;
struct mlx5e_rq_stats *rq_stats = &channel_stats->rq;
int j;
- s->rx_packets += rq_stats->packets;
- s->rx_bytes += rq_stats->bytes;
+ s->rx_packets += rq_stats->packets + xskrq_stats->packets;
+ s->rx_bytes += rq_stats->bytes + xskrq_stats->bytes;
for (j = 0; j < priv->max_opened_tc; j++) {
struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
@@ -3492,6 +3593,13 @@ static int set_feature_lro(struct net_device *netdev, bool enable)
mutex_lock(&priv->state_lock);
+ if (enable && priv->xsk.refcnt) {
+ netdev_warn(netdev, "LRO is incompatible with AF_XDP (%hu XSKs are active)\n",
+ priv->xsk.refcnt);
+ err = -EINVAL;
+ goto out;
+ }
+
old_params = &priv->channels.params;
if (enable && !MLX5E_GET_PFLAG(old_params, MLX5E_PFLAG_RX_STRIDING_RQ)) {
netdev_warn(netdev, "can't set LRO with legacy RQ\n");
@@ -3505,8 +3613,8 @@ static int set_feature_lro(struct net_device *netdev, bool enable)
new_channels.params.lro_en = enable;
if (old_params->rq_wq_type != MLX5_WQ_TYPE_CYCLIC) {
- if (mlx5e_rx_mpwqe_is_linear_skb(mdev, old_params) ==
- mlx5e_rx_mpwqe_is_linear_skb(mdev, &new_channels.params))
+ if (mlx5e_rx_mpwqe_is_linear_skb(mdev, old_params, NULL) ==
+ mlx5e_rx_mpwqe_is_linear_skb(mdev, &new_channels.params, NULL))
reset = false;
}
@@ -3696,6 +3804,43 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev,
return features;
}
+static bool mlx5e_xsk_validate_mtu(struct net_device *netdev,
+ struct mlx5e_channels *chs,
+ struct mlx5e_params *new_params,
+ struct mlx5_core_dev *mdev)
+{
+ u16 ix;
+
+ for (ix = 0; ix < chs->params.num_channels; ix++) {
+ struct xdp_umem *umem = mlx5e_xsk_get_umem(&chs->params, chs->params.xsk, ix);
+ struct mlx5e_xsk_param xsk;
+
+ if (!umem)
+ continue;
+
+ mlx5e_build_xsk_param(umem, &xsk);
+
+ if (!mlx5e_validate_xsk_param(new_params, &xsk, mdev)) {
+ u32 hr = mlx5e_get_linear_rq_headroom(new_params, &xsk);
+ int max_mtu_frame, max_mtu_page, max_mtu;
+
+ /* Two criteria must be met:
+ * 1. HW MTU + all headrooms <= XSK frame size.
+ * 2. Size of SKBs allocated on XDP_PASS <= PAGE_SIZE.
+ */
+ max_mtu_frame = MLX5E_HW2SW_MTU(new_params, xsk.chunk_size - hr);
+ max_mtu_page = mlx5e_xdp_max_mtu(new_params, &xsk);
+ max_mtu = min(max_mtu_frame, max_mtu_page);
+
+ netdev_err(netdev, "MTU %d is too big for an XSK running on channel %hu. Try MTU <= %d\n",
+ new_params->sw_mtu, ix, max_mtu);
+ return false;
+ }
+ }
+
+ return true;
+}
+
int mlx5e_change_mtu(struct net_device *netdev, int new_mtu,
change_hw_mtu_cb set_mtu_cb)
{
@@ -3716,18 +3861,31 @@ int mlx5e_change_mtu(struct net_device *netdev, int new_mtu,
new_channels.params.sw_mtu = new_mtu;
if (params->xdp_prog &&
- !mlx5e_rx_is_linear_skb(&new_channels.params)) {
+ !mlx5e_rx_is_linear_skb(&new_channels.params, NULL)) {
netdev_err(netdev, "MTU(%d) > %d is not allowed while XDP enabled\n",
- new_mtu, mlx5e_xdp_max_mtu(params));
+ new_mtu, mlx5e_xdp_max_mtu(params, NULL));
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (priv->xsk.refcnt &&
+ !mlx5e_xsk_validate_mtu(netdev, &priv->channels,
+ &new_channels.params, priv->mdev)) {
err = -EINVAL;
goto out;
}
if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
- bool is_linear = mlx5e_rx_mpwqe_is_linear_skb(priv->mdev, &new_channels.params);
- u8 ppw_old = mlx5e_mpwqe_log_pkts_per_wqe(params);
- u8 ppw_new = mlx5e_mpwqe_log_pkts_per_wqe(&new_channels.params);
+ bool is_linear = mlx5e_rx_mpwqe_is_linear_skb(priv->mdev,
+ &new_channels.params,
+ NULL);
+ u8 ppw_old = mlx5e_mpwqe_log_pkts_per_wqe(params, NULL);
+ u8 ppw_new = mlx5e_mpwqe_log_pkts_per_wqe(&new_channels.params, NULL);
+
+ /* If XSK is active, XSK RQs are linear. */
+ is_linear |= priv->xsk.refcnt;
+ /* Always reset in linear mode - hw_mtu is used in data path. */
reset = reset && (is_linear || (ppw_old != ppw_new));
}
@@ -4160,16 +4318,29 @@ static int mlx5e_xdp_allowed(struct mlx5e_priv *priv, struct bpf_prog *prog)
new_channels.params = priv->channels.params;
new_channels.params.xdp_prog = prog;
- if (!mlx5e_rx_is_linear_skb(&new_channels.params)) {
+ /* No XSK params: AF_XDP can't be enabled yet at the point of setting
+ * the XDP program.
+ */
+ if (!mlx5e_rx_is_linear_skb(&new_channels.params, NULL)) {
netdev_warn(netdev, "XDP is not allowed with MTU(%d) > %d\n",
new_channels.params.sw_mtu,
- mlx5e_xdp_max_mtu(&new_channels.params));
+ mlx5e_xdp_max_mtu(&new_channels.params, NULL));
return -EINVAL;
}
return 0;
}
+static int mlx5e_xdp_update_state(struct mlx5e_priv *priv)
+{
+ if (priv->channels.params.xdp_prog)
+ mlx5e_xdp_set_open(priv);
+ else
+ mlx5e_xdp_set_closed(priv);
+
+ return 0;
+}
+
static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
@@ -4190,8 +4361,6 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
/* no need for full reset when exchanging programs */
reset = (!priv->channels.params.xdp_prog || !prog);
- if (was_opened && reset)
- mlx5e_close_locked(netdev);
if (was_opened && !reset) {
/* num_channels is invariant here, so we can take the
* batched reference right upfront.
@@ -4203,20 +4372,31 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
}
}
- /* exchange programs, extra prog reference we got from caller
- * as long as we don't fail from this point onwards.
- */
- old_prog = xchg(&priv->channels.params.xdp_prog, prog);
+ if (was_opened && reset) {
+ struct mlx5e_channels new_channels = {};
+
+ new_channels.params = priv->channels.params;
+ new_channels.params.xdp_prog = prog;
+ mlx5e_set_rq_type(priv->mdev, &new_channels.params);
+ old_prog = priv->channels.params.xdp_prog;
+
+ err = mlx5e_safe_switch_channels(priv, &new_channels, mlx5e_xdp_update_state);
+ if (err)
+ goto unlock;
+ } else {
+ /* exchange programs, extra prog reference we got from caller
+ * as long as we don't fail from this point onwards.
+ */
+ old_prog = xchg(&priv->channels.params.xdp_prog, prog);
+ }
+
if (old_prog)
bpf_prog_put(old_prog);
- if (reset) /* change RQ type according to priv->xdp_prog */
+ if (!was_opened && reset) /* change RQ type according to priv->xdp_prog */
mlx5e_set_rq_type(priv->mdev, &priv->channels.params);
- if (was_opened && reset)
- err = mlx5e_open_locked(netdev);
-
- if (!test_bit(MLX5E_STATE_OPENED, &priv->state) || reset)
+ if (!was_opened || reset)
goto unlock;
/* exchanging programs w/o reset, we update ref counts on behalf
@@ -4224,19 +4404,29 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
*/
for (i = 0; i < priv->channels.num; i++) {
struct mlx5e_channel *c = priv->channels.c[i];
+ bool xsk_open = test_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
clear_bit(MLX5E_RQ_STATE_ENABLED, &c->rq.state);
+ if (xsk_open)
+ clear_bit(MLX5E_RQ_STATE_ENABLED, &c->xskrq.state);
napi_synchronize(&c->napi);
/* prevent mlx5e_poll_rx_cq from accessing rq->xdp_prog */
old_prog = xchg(&c->rq.xdp_prog, prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
+
+ if (xsk_open) {
+ old_prog = xchg(&c->xskrq.xdp_prog, prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
+ }
set_bit(MLX5E_RQ_STATE_ENABLED, &c->rq.state);
+ if (xsk_open)
+ set_bit(MLX5E_RQ_STATE_ENABLED, &c->xskrq.state);
/* napi_schedule in case we have missed anything */
napi_schedule(&c->napi);
-
- if (old_prog)
- bpf_prog_put(old_prog);
}
unlock:
@@ -4267,6 +4457,9 @@ static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp)
case XDP_QUERY_PROG:
xdp->prog_id = mlx5e_xdp_query(dev);
return 0;
+ case XDP_SETUP_XSK_UMEM:
+ return mlx5e_xsk_setup_umem(dev, xdp->xsk.umem,
+ xdp->xsk.queue_id);
default:
return -EINVAL;
}
@@ -4349,6 +4542,7 @@ const struct net_device_ops mlx5e_netdev_ops = {
.ndo_tx_timeout = mlx5e_tx_timeout,
.ndo_bpf = mlx5e_xdp,
.ndo_xdp_xmit = mlx5e_xdp_xmit,
+ .ndo_xsk_async_xmit = mlx5e_xsk_async_xmit,
#ifdef CONFIG_MLX5_EN_ARFS
.ndo_rx_flow_steer = mlx5e_rx_flow_steer,
#endif
@@ -4500,11 +4694,13 @@ void mlx5e_build_rq_params(struct mlx5_core_dev *mdev,
* - Striding RQ configuration is not possible/supported.
* - Slow PCI heuristic.
* - Legacy RQ would use linear SKB while Striding RQ would use non-linear.
+ *
+ * No XSK params: checking the availability of striding RQ in general.
*/
if (!slow_pci_heuristic(mdev) &&
mlx5e_striding_rq_possible(mdev, params) &&
- (mlx5e_rx_mpwqe_is_linear_skb(mdev, params) ||
- !mlx5e_rx_is_linear_skb(params)))
+ (mlx5e_rx_mpwqe_is_linear_skb(mdev, params, NULL) ||
+ !mlx5e_rx_is_linear_skb(params, NULL)))
MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ, true);
mlx5e_set_rq_type(mdev, params);
mlx5e_init_rq_type_params(mdev, params);
@@ -4526,6 +4722,7 @@ void mlx5e_build_rss_params(struct mlx5e_rss_params *rss_params,
}
void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
+ struct mlx5e_xsk *xsk,
struct mlx5e_rss_params *rss_params,
struct mlx5e_params *params,
u16 max_channels, u16 mtu)
@@ -4561,9 +4758,11 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
/* HW LRO */
/* TODO: && MLX5_CAP_ETH(mdev, lro_cap) */
- if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
- if (!mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
+ if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
+ /* No XSK params: checking the availability of striding RQ in general. */
+ if (!mlx5e_rx_mpwqe_is_linear_skb(mdev, params, NULL))
params->lro_en = !slow_pci_heuristic(mdev);
+ }
params->lro_timeout = mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_LRO_TIMEOUT);
/* CQ moderation params */
@@ -4582,6 +4781,9 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
mlx5e_build_rss_params(rss_params, params->num_channels);
params->tunneled_offload_en =
mlx5e_tunnel_inner_ft_supported(mdev);
+
+ /* AF_XDP */
+ params->xsk = xsk;
}
static void mlx5e_set_netdev_dev_addr(struct net_device *netdev)
@@ -4757,7 +4959,7 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev,
if (err)
return err;
- mlx5e_build_nic_params(mdev, rss, &priv->channels.params,
+ mlx5e_build_nic_params(mdev, &priv->xsk, rss, &priv->channels.params,
mlx5e_get_netdev_max_channels(netdev),
netdev->mtu);
@@ -4799,7 +5001,7 @@ static int mlx5e_init_nic_rx(struct mlx5e_priv *priv)
if (err)
goto err_close_drop_rq;
- err = mlx5e_create_direct_rqts(priv);
+ err = mlx5e_create_direct_rqts(priv, priv->direct_tir);
if (err)
goto err_destroy_indirect_rqts;
@@ -4807,14 +5009,22 @@ static int mlx5e_init_nic_rx(struct mlx5e_priv *priv)
if (err)
goto err_destroy_direct_rqts;
- err = mlx5e_create_direct_tirs(priv);
+ err = mlx5e_create_direct_tirs(priv, priv->direct_tir);
if (err)
goto err_destroy_indirect_tirs;
+ err = mlx5e_create_direct_rqts(priv, priv->xsk_tir);
+ if (unlikely(err))
+ goto err_destroy_direct_tirs;
+
+ err = mlx5e_create_direct_tirs(priv, priv->xsk_tir);
+ if (unlikely(err))
+ goto err_destroy_xsk_rqts;
+
err = mlx5e_create_flow_steering(priv);
if (err) {
mlx5_core_warn(mdev, "create flow steering failed, %d\n", err);
- goto err_destroy_direct_tirs;
+ goto err_destroy_xsk_tirs;
}
err = mlx5e_tc_nic_init(priv);
@@ -4825,12 +5035,16 @@ static int mlx5e_init_nic_rx(struct mlx5e_priv *priv)
err_destroy_flow_steering:
mlx5e_destroy_flow_steering(priv);
+err_destroy_xsk_tirs:
+ mlx5e_destroy_direct_tirs(priv, priv->xsk_tir);
+err_destroy_xsk_rqts:
+ mlx5e_destroy_direct_rqts(priv, priv->xsk_tir);
err_destroy_direct_tirs:
- mlx5e_destroy_direct_tirs(priv);
+ mlx5e_destroy_direct_tirs(priv, priv->direct_tir);
err_destroy_indirect_tirs:
mlx5e_destroy_indirect_tirs(priv, true);
err_destroy_direct_rqts:
- mlx5e_destroy_direct_rqts(priv);
+ mlx5e_destroy_direct_rqts(priv, priv->direct_tir);
err_destroy_indirect_rqts:
mlx5e_destroy_rqt(priv, &priv->indir_rqt);
err_close_drop_rq:
@@ -4844,9 +5058,11 @@ static void mlx5e_cleanup_nic_rx(struct mlx5e_priv *priv)
{
mlx5e_tc_nic_cleanup(priv);
mlx5e_destroy_flow_steering(priv);
- mlx5e_destroy_direct_tirs(priv);
+ mlx5e_destroy_direct_tirs(priv, priv->xsk_tir);
+ mlx5e_destroy_direct_rqts(priv, priv->xsk_tir);
+ mlx5e_destroy_direct_tirs(priv, priv->direct_tir);
mlx5e_destroy_indirect_tirs(priv, true);
- mlx5e_destroy_direct_rqts(priv);
+ mlx5e_destroy_direct_rqts(priv, priv->direct_tir);
mlx5e_destroy_rqt(priv, &priv->indir_rqt);
mlx5e_close_drop_rq(&priv->drop_rq);
mlx5e_destroy_q_counters(priv);
@@ -5002,7 +5218,7 @@ struct net_device *mlx5e_create_netdev(struct mlx5_core_dev *mdev,
netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv),
nch * profile->max_tc,
- nch);
+ nch * MLX5E_NUM_RQ_GROUPS);
if (!netdev) {
mlx5_core_err(mdev, "alloc_etherdev_mqs() failed\n");
return NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 330034fcdfc5..6a013a8c1150 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1510,7 +1510,7 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
if (err)
goto err_close_drop_rq;
- err = mlx5e_create_direct_rqts(priv);
+ err = mlx5e_create_direct_rqts(priv, priv->direct_tir);
if (err)
goto err_destroy_indirect_rqts;
@@ -1518,7 +1518,7 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
if (err)
goto err_destroy_direct_rqts;
- err = mlx5e_create_direct_tirs(priv);
+ err = mlx5e_create_direct_tirs(priv, priv->direct_tir);
if (err)
goto err_destroy_indirect_tirs;
@@ -1535,11 +1535,11 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
err_destroy_ttc_table:
mlx5e_destroy_ttc_table(priv, &priv->fs.ttc);
err_destroy_direct_tirs:
- mlx5e_destroy_direct_tirs(priv);
+ mlx5e_destroy_direct_tirs(priv, priv->direct_tir);
err_destroy_indirect_tirs:
mlx5e_destroy_indirect_tirs(priv, false);
err_destroy_direct_rqts:
- mlx5e_destroy_direct_rqts(priv);
+ mlx5e_destroy_direct_rqts(priv, priv->direct_tir);
err_destroy_indirect_rqts:
mlx5e_destroy_rqt(priv, &priv->indir_rqt);
err_close_drop_rq:
@@ -1553,9 +1553,9 @@ static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv)
mlx5_del_flow_rules(rpriv->vport_rx_rule);
mlx5e_destroy_ttc_table(priv, &priv->fs.ttc);
- mlx5e_destroy_direct_tirs(priv);
+ mlx5e_destroy_direct_tirs(priv, priv->direct_tir);
mlx5e_destroy_indirect_tirs(priv, false);
- mlx5e_destroy_direct_rqts(priv);
+ mlx5e_destroy_direct_rqts(priv, priv->direct_tir);
mlx5e_destroy_rqt(priv, &priv->indir_rqt);
mlx5e_close_drop_rq(&priv->drop_rq);
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 234a3fd39901..56a2f4666c47 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -47,6 +47,7 @@
#include "en_accel/tls_rxtx.h"
#include "lib/clock.h"
#include "en/xdp.h"
+#include "en/xsk/rx.h"
static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config)
{
@@ -235,8 +236,8 @@ static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq,
return true;
}
-static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
- struct mlx5e_dma_info *dma_info)
+static inline int mlx5e_page_alloc_pool(struct mlx5e_rq *rq,
+ struct mlx5e_dma_info *dma_info)
{
if (mlx5e_rx_cache_get(rq, dma_info))
return 0;
@@ -256,13 +257,23 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
return 0;
}
+static inline int mlx5e_page_alloc(struct mlx5e_rq *rq,
+ struct mlx5e_dma_info *dma_info)
+{
+ if (rq->umem)
+ return mlx5e_xsk_page_alloc_umem(rq, dma_info);
+ else
+ return mlx5e_page_alloc_pool(rq, dma_info);
+}
+
void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info)
{
dma_unmap_page(rq->pdev, dma_info->addr, PAGE_SIZE, rq->buff.map_dir);
}
-void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
- bool recycle)
+void mlx5e_page_release_dynamic(struct mlx5e_rq *rq,
+ struct mlx5e_dma_info *dma_info,
+ bool recycle)
{
if (likely(recycle)) {
if (mlx5e_rx_cache_put(rq, dma_info))
@@ -277,6 +288,20 @@ void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
}
}
+static inline void mlx5e_page_release(struct mlx5e_rq *rq,
+ struct mlx5e_dma_info *dma_info,
+ bool recycle)
+{
+ if (rq->umem)
+ /* The `recycle` parameter is ignored, and the page is always
+ * put into the Reuse Ring, because there is no way to return
+ * the page to the userspace when the interface goes down.
+ */
+ mlx5e_xsk_page_release(rq, dma_info);
+ else
+ mlx5e_page_release_dynamic(rq, dma_info, recycle);
+}
+
static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *frag)
{
@@ -288,7 +313,7 @@ static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
* offset) should just use the new one without replenishing again
* by themselves.
*/
- err = mlx5e_page_alloc_mapped(rq, frag->di);
+ err = mlx5e_page_alloc(rq, frag->di);
return err;
}
@@ -354,6 +379,13 @@ static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
int err;
int i;
+ if (rq->umem) {
+ int pages_desired = wqe_bulk << rq->wqe.info.log_num_frags;
+
+ if (unlikely(!mlx5e_xsk_pages_enough_umem(rq, pages_desired)))
+ return -ENOMEM;
+ }
+
for (i = 0; i < wqe_bulk; i++) {
struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, ix + i);
@@ -401,11 +433,17 @@ mlx5e_copy_skb_header(struct device *pdev, struct sk_buff *skb,
static void
mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, bool recycle)
{
- const bool no_xdp_xmit =
- bitmap_empty(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
+ bool no_xdp_xmit;
struct mlx5e_dma_info *dma_info = wi->umr.dma_info;
int i;
+ /* A common case for AF_XDP. */
+ if (bitmap_full(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE))
+ return;
+
+ no_xdp_xmit = bitmap_empty(wi->xdp_xmit_bitmap,
+ MLX5_MPWRQ_PAGES_PER_WQE);
+
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++)
if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap))
mlx5e_page_release(rq, &dma_info[i], recycle);
@@ -427,11 +465,6 @@ static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq, u8 n)
mlx5_wq_ll_update_db_record(wq);
}
-static inline u16 mlx5e_icosq_wrap_cnt(struct mlx5e_icosq *sq)
-{
- return mlx5_wq_cyc_get_ctr_wrap_cnt(&sq->wq, sq->pc);
-}
-
static inline void mlx5e_fill_icosq_frag_edge(struct mlx5e_icosq *sq,
struct mlx5_wq_cyc *wq,
u16 pi, u16 nnops)
@@ -459,6 +492,12 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
int err;
int i;
+ if (rq->umem &&
+ unlikely(!mlx5e_xsk_pages_enough_umem(rq, MLX5_MPWRQ_PAGES_PER_WQE))) {
+ err = -ENOMEM;
+ goto err;
+ }
+
pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi);
if (unlikely(contig_wqebbs_room < MLX5E_UMR_WQEBBS)) {
@@ -467,12 +506,10 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
}
umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
- if (unlikely(mlx5e_icosq_wrap_cnt(sq) < 2))
- memcpy(umr_wqe, &rq->mpwqe.umr_wqe,
- offsetof(struct mlx5e_umr_wqe, inline_mtts));
+ memcpy(umr_wqe, &rq->mpwqe.umr_wqe, offsetof(struct mlx5e_umr_wqe, inline_mtts));
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) {
- err = mlx5e_page_alloc_mapped(rq, dma_info);
+ err = mlx5e_page_alloc(rq, dma_info);
if (unlikely(err))
goto err_unmap;
umr_wqe->inline_mtts[i].ptag = cpu_to_be64(dma_info->addr | MLX5_EN_WR);
@@ -487,6 +524,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
umr_wqe->uctrl.xlt_offset = cpu_to_be16(xlt_offset);
sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_UMR;
+ sq->db.ico_wqe[pi].umr.rq = rq;
sq->pc += MLX5E_UMR_WQEBBS;
sq->doorbell_cseg = &umr_wqe->ctrl;
@@ -498,6 +536,8 @@ err_unmap:
dma_info--;
mlx5e_page_release(rq, dma_info, true);
}
+
+err:
rq->stats->buff_alloc_err++;
return err;
@@ -544,11 +584,10 @@ bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
return !!err;
}
-static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
+void mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
{
struct mlx5e_icosq *sq = container_of(cq, struct mlx5e_icosq, cq);
struct mlx5_cqe64 *cqe;
- u8 completed_umr = 0;
u16 sqcc;
int i;
@@ -589,7 +628,7 @@ static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
if (likely(wi->opcode == MLX5_OPCODE_UMR)) {
sqcc += MLX5E_UMR_WQEBBS;
- completed_umr++;
+ wi->umr.rq->mpwqe.umr_completed++;
} else if (likely(wi->opcode == MLX5_OPCODE_NOP)) {
sqcc++;
} else {
@@ -605,24 +644,25 @@ static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
sq->cc = sqcc;
mlx5_cqwq_update_db_record(&cq->wq);
-
- if (likely(completed_umr)) {
- mlx5e_post_rx_mpwqe(rq, completed_umr);
- rq->mpwqe.umr_in_progress -= completed_umr;
- }
}
bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
{
struct mlx5e_icosq *sq = &rq->channel->icosq;
struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
+ u8 umr_completed = rq->mpwqe.umr_completed;
+ int alloc_err = 0;
u8 missing, i;
u16 head;
if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
return false;
- mlx5e_poll_ico_cq(&sq->cq, rq);
+ if (umr_completed) {
+ mlx5e_post_rx_mpwqe(rq, umr_completed);
+ rq->mpwqe.umr_in_progress -= umr_completed;
+ rq->mpwqe.umr_completed = 0;
+ }
missing = mlx5_wq_ll_missing(wq) - rq->mpwqe.umr_in_progress;
@@ -636,7 +676,9 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
head = rq->mpwqe.actual_wq_head;
i = missing;
do {
- if (unlikely(mlx5e_alloc_rx_mpwqe(rq, head)))
+ alloc_err = mlx5e_alloc_rx_mpwqe(rq, head);
+
+ if (unlikely(alloc_err))
break;
head = mlx5_wq_ll_get_wqe_next_ix(wq, head);
} while (--i);
@@ -650,6 +692,12 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
rq->mpwqe.umr_in_progress += rq->mpwqe.umr_last_bulk;
rq->mpwqe.actual_wq_head = head;
+ /* If XSK Fill Ring doesn't have enough frames, busy poll by
+ * rescheduling the NAPI poll.
+ */
+ if (unlikely(alloc_err == -ENOMEM && rq->umem))
+ return true;
+
return false;
}
@@ -1018,7 +1066,7 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
}
rcu_read_lock();
- consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt);
+ consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, false);
rcu_read_unlock();
if (consumed)
return NULL; /* page/packet was consumed by XDP */
@@ -1235,7 +1283,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
prefetch(data);
rcu_read_lock();
- consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32);
+ consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, false);
rcu_read_unlock();
if (consumed) {
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 483d321d2151..5f540db47cc9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -104,7 +104,33 @@ static const struct counter_desc sw_stats_desc[] = {
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_poll) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_arm) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_aff_change) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_force_irq) },
{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, ch_eq_rearm) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_packets) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_bytes) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_csum_complete) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_csum_unnecessary) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_csum_unnecessary_inner) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_csum_none) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_ecn_mark) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_removed_vlan_packets) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_xdp_drop) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_xdp_redirect) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_wqe_err) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_mpwqe_filler_cqes) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_mpwqe_filler_strides) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_oversize_pkts_sw_drop) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_buff_alloc_err) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_cqe_compress_blks) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_cqe_compress_pkts) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_congst_umr) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_xsk_arfs_err) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_xmit) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_mpwqe) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_inlnw) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_full) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_err) },
+ { MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xsk_cqes) },
};
#define NUM_SW_COUNTERS ARRAY_SIZE(sw_stats_desc)
@@ -144,6 +170,8 @@ static void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
&priv->channel_stats[i];
struct mlx5e_xdpsq_stats *xdpsq_red_stats = &channel_stats->xdpsq;
struct mlx5e_xdpsq_stats *xdpsq_stats = &channel_stats->rq_xdpsq;
+ struct mlx5e_xdpsq_stats *xsksq_stats = &channel_stats->xsksq;
+ struct mlx5e_rq_stats *xskrq_stats = &channel_stats->xskrq;
struct mlx5e_rq_stats *rq_stats = &channel_stats->rq;
struct mlx5e_ch_stats *ch_stats = &channel_stats->ch;
int j;
@@ -186,6 +214,7 @@ static void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
s->ch_poll += ch_stats->poll;
s->ch_arm += ch_stats->arm;
s->ch_aff_change += ch_stats->aff_change;
+ s->ch_force_irq += ch_stats->force_irq;
s->ch_eq_rearm += ch_stats->eq_rearm;
/* xdp redirect */
s->tx_xdp_xmit += xdpsq_red_stats->xmit;
@@ -194,6 +223,32 @@ static void mlx5e_grp_sw_update_stats(struct mlx5e_priv *priv)
s->tx_xdp_full += xdpsq_red_stats->full;
s->tx_xdp_err += xdpsq_red_stats->err;
s->tx_xdp_cqes += xdpsq_red_stats->cqes;
+ /* AF_XDP zero-copy */
+ s->rx_xsk_packets += xskrq_stats->packets;
+ s->rx_xsk_bytes += xskrq_stats->bytes;
+ s->rx_xsk_csum_complete += xskrq_stats->csum_complete;
+ s->rx_xsk_csum_unnecessary += xskrq_stats->csum_unnecessary;
+ s->rx_xsk_csum_unnecessary_inner += xskrq_stats->csum_unnecessary_inner;
+ s->rx_xsk_csum_none += xskrq_stats->csum_none;
+ s->rx_xsk_ecn_mark += xskrq_stats->ecn_mark;
+ s->rx_xsk_removed_vlan_packets += xskrq_stats->removed_vlan_packets;
+ s->rx_xsk_xdp_drop += xskrq_stats->xdp_drop;
+ s->rx_xsk_xdp_redirect += xskrq_stats->xdp_redirect;
+ s->rx_xsk_wqe_err += xskrq_stats->wqe_err;
+ s->rx_xsk_mpwqe_filler_cqes += xskrq_stats->mpwqe_filler_cqes;
+ s->rx_xsk_mpwqe_filler_strides += xskrq_stats->mpwqe_filler_strides;
+ s->rx_xsk_oversize_pkts_sw_drop += xskrq_stats->oversize_pkts_sw_drop;
+ s->rx_xsk_buff_alloc_err += xskrq_stats->buff_alloc_err;
+ s->rx_xsk_cqe_compress_blks += xskrq_stats->cqe_compress_blks;
+ s->rx_xsk_cqe_compress_pkts += xskrq_stats->cqe_compress_pkts;
+ s->rx_xsk_congst_umr += xskrq_stats->congst_umr;
+ s->rx_xsk_arfs_err += xskrq_stats->arfs_err;
+ s->tx_xsk_xmit += xsksq_stats->xmit;
+ s->tx_xsk_mpwqe += xsksq_stats->mpwqe;
+ s->tx_xsk_inlnw += xsksq_stats->inlnw;
+ s->tx_xsk_full += xsksq_stats->full;
+ s->tx_xsk_err += xsksq_stats->err;
+ s->tx_xsk_cqes += xsksq_stats->cqes;
for (j = 0; j < priv->max_opened_tc; j++) {
struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
@@ -1266,11 +1321,43 @@ static const struct counter_desc xdpsq_stats_desc[] = {
{ MLX5E_DECLARE_XDPSQ_STAT(struct mlx5e_xdpsq_stats, cqes) },
};
+static const struct counter_desc xskrq_stats_desc[] = {
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, packets) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, bytes) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, csum_complete) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, csum_unnecessary) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, csum_unnecessary_inner) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, csum_none) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, ecn_mark) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, removed_vlan_packets) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, xdp_drop) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, xdp_redirect) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, wqe_err) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, mpwqe_filler_cqes) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, mpwqe_filler_strides) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, oversize_pkts_sw_drop) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, buff_alloc_err) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, cqe_compress_blks) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, congst_umr) },
+ { MLX5E_DECLARE_XSKRQ_STAT(struct mlx5e_rq_stats, arfs_err) },
+};
+
+static const struct counter_desc xsksq_stats_desc[] = {
+ { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, xmit) },
+ { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, mpwqe) },
+ { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, inlnw) },
+ { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, full) },
+ { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, err) },
+ { MLX5E_DECLARE_XSKSQ_STAT(struct mlx5e_xdpsq_stats, cqes) },
+};
+
static const struct counter_desc ch_stats_desc[] = {
{ MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, events) },
{ MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, poll) },
{ MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, arm) },
{ MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, aff_change) },
+ { MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, force_irq) },
{ MLX5E_DECLARE_CH_STAT(struct mlx5e_ch_stats, eq_rearm) },
};
@@ -1278,6 +1365,8 @@ static const struct counter_desc ch_stats_desc[] = {
#define NUM_SQ_STATS ARRAY_SIZE(sq_stats_desc)
#define NUM_XDPSQ_STATS ARRAY_SIZE(xdpsq_stats_desc)
#define NUM_RQ_XDPSQ_STATS ARRAY_SIZE(rq_xdpsq_stats_desc)
+#define NUM_XSKRQ_STATS ARRAY_SIZE(xskrq_stats_desc)
+#define NUM_XSKSQ_STATS ARRAY_SIZE(xsksq_stats_desc)
#define NUM_CH_STATS ARRAY_SIZE(ch_stats_desc)
static int mlx5e_grp_channels_get_num_stats(struct mlx5e_priv *priv)
@@ -1288,13 +1377,16 @@ static int mlx5e_grp_channels_get_num_stats(struct mlx5e_priv *priv)
(NUM_CH_STATS * max_nch) +
(NUM_SQ_STATS * max_nch * priv->max_opened_tc) +
(NUM_RQ_XDPSQ_STATS * max_nch) +
- (NUM_XDPSQ_STATS * max_nch);
+ (NUM_XDPSQ_STATS * max_nch) +
+ (NUM_XSKRQ_STATS * max_nch * priv->xsk.ever_used) +
+ (NUM_XSKSQ_STATS * max_nch * priv->xsk.ever_used);
}
static int mlx5e_grp_channels_fill_strings(struct mlx5e_priv *priv, u8 *data,
int idx)
{
int max_nch = mlx5e_get_netdev_max_channels(priv->netdev);
+ bool is_xsk = priv->xsk.ever_used;
int i, j, tc;
for (i = 0; i < max_nch; i++)
@@ -1306,6 +1398,9 @@ static int mlx5e_grp_channels_fill_strings(struct mlx5e_priv *priv, u8 *data,
for (j = 0; j < NUM_RQ_STATS; j++)
sprintf(data + (idx++) * ETH_GSTRING_LEN,
rq_stats_desc[j].format, i);
+ for (j = 0; j < NUM_XSKRQ_STATS * is_xsk; j++)
+ sprintf(data + (idx++) * ETH_GSTRING_LEN,
+ xskrq_stats_desc[j].format, i);
for (j = 0; j < NUM_RQ_XDPSQ_STATS; j++)
sprintf(data + (idx++) * ETH_GSTRING_LEN,
rq_xdpsq_stats_desc[j].format, i);
@@ -1318,10 +1413,14 @@ static int mlx5e_grp_channels_fill_strings(struct mlx5e_priv *priv, u8 *data,
sq_stats_desc[j].format,
priv->channel_tc2txq[i][tc]);
- for (i = 0; i < max_nch; i++)
+ for (i = 0; i < max_nch; i++) {
+ for (j = 0; j < NUM_XSKSQ_STATS * is_xsk; j++)
+ sprintf(data + (idx++) * ETH_GSTRING_LEN,
+ xsksq_stats_desc[j].format, i);
for (j = 0; j < NUM_XDPSQ_STATS; j++)
sprintf(data + (idx++) * ETH_GSTRING_LEN,
xdpsq_stats_desc[j].format, i);
+ }
return idx;
}
@@ -1330,6 +1429,7 @@ static int mlx5e_grp_channels_fill_stats(struct mlx5e_priv *priv, u64 *data,
int idx)
{
int max_nch = mlx5e_get_netdev_max_channels(priv->netdev);
+ bool is_xsk = priv->xsk.ever_used;
int i, j, tc;
for (i = 0; i < max_nch; i++)
@@ -1343,6 +1443,10 @@ static int mlx5e_grp_channels_fill_stats(struct mlx5e_priv *priv, u64 *data,
data[idx++] =
MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].rq,
rq_stats_desc, j);
+ for (j = 0; j < NUM_XSKRQ_STATS * is_xsk; j++)
+ data[idx++] =
+ MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].xskrq,
+ xskrq_stats_desc, j);
for (j = 0; j < NUM_RQ_XDPSQ_STATS; j++)
data[idx++] =
MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].rq_xdpsq,
@@ -1356,11 +1460,16 @@ static int mlx5e_grp_channels_fill_stats(struct mlx5e_priv *priv, u64 *data,
MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].sq[tc],
sq_stats_desc, j);
- for (i = 0; i < max_nch; i++)
+ for (i = 0; i < max_nch; i++) {
+ for (j = 0; j < NUM_XSKSQ_STATS * is_xsk; j++)
+ data[idx++] =
+ MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].xsksq,
+ xsksq_stats_desc, j);
for (j = 0; j < NUM_XDPSQ_STATS; j++)
data[idx++] =
MLX5E_READ_CTR64_CPU(&priv->channel_stats[i].xdpsq,
xdpsq_stats_desc, j);
+ }
return idx;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index cdddcc46971b..fb3ad7231e11 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -46,6 +46,8 @@
#define MLX5E_DECLARE_TX_STAT(type, fld) "tx%d_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_XDPSQ_STAT(type, fld) "tx%d_xdp_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_RQ_XDPSQ_STAT(type, fld) "rx%d_xdp_tx_"#fld, offsetof(type, fld)
+#define MLX5E_DECLARE_XSKRQ_STAT(type, fld) "rx%d_xsk_"#fld, offsetof(type, fld)
+#define MLX5E_DECLARE_XSKSQ_STAT(type, fld) "tx%d_xsk_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_CH_STAT(type, fld) "ch%d_"#fld, offsetof(type, fld)
struct counter_desc {
@@ -116,12 +118,39 @@ struct mlx5e_sw_stats {
u64 ch_poll;
u64 ch_arm;
u64 ch_aff_change;
+ u64 ch_force_irq;
u64 ch_eq_rearm;
#ifdef CONFIG_MLX5_EN_TLS
u64 tx_tls_ooo;
u64 tx_tls_resync_bytes;
#endif
+
+ u64 rx_xsk_packets;
+ u64 rx_xsk_bytes;
+ u64 rx_xsk_csum_complete;
+ u64 rx_xsk_csum_unnecessary;
+ u64 rx_xsk_csum_unnecessary_inner;
+ u64 rx_xsk_csum_none;
+ u64 rx_xsk_ecn_mark;
+ u64 rx_xsk_removed_vlan_packets;
+ u64 rx_xsk_xdp_drop;
+ u64 rx_xsk_xdp_redirect;
+ u64 rx_xsk_wqe_err;
+ u64 rx_xsk_mpwqe_filler_cqes;
+ u64 rx_xsk_mpwqe_filler_strides;
+ u64 rx_xsk_oversize_pkts_sw_drop;
+ u64 rx_xsk_buff_alloc_err;
+ u64 rx_xsk_cqe_compress_blks;
+ u64 rx_xsk_cqe_compress_pkts;
+ u64 rx_xsk_congst_umr;
+ u64 rx_xsk_arfs_err;
+ u64 tx_xsk_xmit;
+ u64 tx_xsk_mpwqe;
+ u64 tx_xsk_inlnw;
+ u64 tx_xsk_full;
+ u64 tx_xsk_err;
+ u64 tx_xsk_cqes;
};
struct mlx5e_qcounter_stats {
@@ -256,6 +285,7 @@ struct mlx5e_ch_stats {
u64 poll;
u64 arm;
u64 aff_change;
+ u64 force_irq;
u64 eq_rearm;
};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index e6c434efbd46..3c98b7fe2923 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -33,6 +33,7 @@
#include <linux/irq.h>
#include "en.h"
#include "en/xdp.h"
+#include "en/xsk/tx.h"
static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
{
@@ -85,7 +86,12 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel,
napi);
struct mlx5e_ch_stats *ch_stats = c->stats;
+ struct mlx5e_xdpsq *xsksq = &c->xsksq;
+ struct mlx5e_rq *xskrq = &c->xskrq;
struct mlx5e_rq *rq = &c->rq;
+ bool xsk_open = test_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
+ bool aff_change = false;
+ bool busy_xsk = false;
bool busy = false;
int work_done = 0;
int i;
@@ -95,22 +101,38 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
for (i = 0; i < c->num_tc; i++)
busy |= mlx5e_poll_tx_cq(&c->sq[i].cq, budget);
- busy |= mlx5e_poll_xdpsq_cq(&c->xdpsq.cq, NULL);
+ busy |= mlx5e_poll_xdpsq_cq(&c->xdpsq.cq);
if (c->xdp)
- busy |= mlx5e_poll_xdpsq_cq(&rq->xdpsq.cq, rq);
+ busy |= mlx5e_poll_xdpsq_cq(&c->rq_xdpsq.cq);
if (likely(budget)) { /* budget=0 means: don't poll rx rings */
- work_done = mlx5e_poll_rx_cq(&rq->cq, budget);
+ if (xsk_open)
+ work_done = mlx5e_poll_rx_cq(&xskrq->cq, budget);
+
+ if (likely(budget - work_done))
+ work_done += mlx5e_poll_rx_cq(&rq->cq, budget - work_done);
+
busy |= work_done == budget;
}
- busy |= c->rq.post_wqes(rq);
+ mlx5e_poll_ico_cq(&c->icosq.cq);
+
+ busy |= rq->post_wqes(rq);
+ if (xsk_open) {
+ mlx5e_poll_ico_cq(&c->xskicosq.cq);
+ busy |= mlx5e_poll_xdpsq_cq(&xsksq->cq);
+ busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
+ busy_xsk |= xskrq->post_wqes(xskrq);
+ }
+
+ busy |= busy_xsk;
if (busy) {
if (likely(mlx5e_channel_no_affinity_change(c)))
return budget;
ch_stats->aff_change++;
+ aff_change = true;
if (budget && work_done == budget)
work_done--;
}
@@ -131,6 +153,18 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
mlx5e_cq_arm(&c->icosq.cq);
mlx5e_cq_arm(&c->xdpsq.cq);
+ if (xsk_open) {
+ mlx5e_handle_rx_dim(xskrq);
+ mlx5e_cq_arm(&c->xskicosq.cq);
+ mlx5e_cq_arm(&xsksq->cq);
+ mlx5e_cq_arm(&xskrq->cq);
+ }
+
+ if (unlikely(aff_change && busy_xsk)) {
+ mlx5e_trigger_irq(&c->icosq);
+ ch_stats->force_irq++;
+ }
+
return work_done;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index e68d124eb625..00e66c3772cc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -87,7 +87,7 @@ int mlx5i_init(struct mlx5_core_dev *mdev,
mlx5e_set_netdev_mtu_boundaries(priv);
netdev->mtu = netdev->max_mtu;
- mlx5e_build_nic_params(mdev, &priv->rss_params, &priv->channels.params,
+ mlx5e_build_nic_params(mdev, NULL, &priv->rss_params, &priv->channels.params,
mlx5e_get_netdev_max_channels(netdev),
netdev->mtu);
mlx5i_build_nic_params(mdev, &priv->channels.params);
@@ -365,7 +365,7 @@ static int mlx5i_init_rx(struct mlx5e_priv *priv)
if (err)
goto err_close_drop_rq;
- err = mlx5e_create_direct_rqts(priv);
+ err = mlx5e_create_direct_rqts(priv, priv->direct_tir);
if (err)
goto err_destroy_indirect_rqts;
@@ -373,7 +373,7 @@ static int mlx5i_init_rx(struct mlx5e_priv *priv)
if (err)
goto err_destroy_direct_rqts;
- err = mlx5e_create_direct_tirs(priv);
+ err = mlx5e_create_direct_tirs(priv, priv->direct_tir);
if (err)
goto err_destroy_indirect_tirs;
@@ -384,11 +384,11 @@ static int mlx5i_init_rx(struct mlx5e_priv *priv)
return 0;
err_destroy_direct_tirs:
- mlx5e_destroy_direct_tirs(priv);
+ mlx5e_destroy_direct_tirs(priv, priv->direct_tir);
err_destroy_indirect_tirs:
mlx5e_destroy_indirect_tirs(priv, true);
err_destroy_direct_rqts:
- mlx5e_destroy_direct_rqts(priv);
+ mlx5e_destroy_direct_rqts(priv, priv->direct_tir);
err_destroy_indirect_rqts:
mlx5e_destroy_rqt(priv, &priv->indir_rqt);
err_close_drop_rq:
@@ -401,9 +401,9 @@ err_destroy_q_counters:
static void mlx5i_cleanup_rx(struct mlx5e_priv *priv)
{
mlx5i_destroy_flow_steering(priv);
- mlx5e_destroy_direct_tirs(priv);
+ mlx5e_destroy_direct_tirs(priv, priv->direct_tir);
mlx5e_destroy_indirect_tirs(priv, true);
- mlx5e_destroy_direct_rqts(priv);
+ mlx5e_destroy_direct_rqts(priv, priv->direct_tir);
mlx5e_destroy_rqt(priv, &priv->indir_rqt);
mlx5e_close_drop_rq(&priv->drop_rq);
mlx5e_destroy_q_counters(priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
index 1f87cce421e0..f1ec58c9e9e3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -134,11 +134,6 @@ static inline void mlx5_wq_cyc_update_db_record(struct mlx5_wq_cyc *wq)
*wq->db = cpu_to_be32(wq->wqe_ctr);
}
-static inline u16 mlx5_wq_cyc_get_ctr_wrap_cnt(struct mlx5_wq_cyc *wq, u16 ctr)
-{
- return ctr >> wq->fbc.log_sz;
-}
-
static inline u16 mlx5_wq_cyc_ctr2ix(struct mlx5_wq_cyc *wq, u16 ctr)
{
return ctr & wq->fbc.sz_m1;
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index c6916bf1017b..9f3c839f9e5f 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -38,6 +38,8 @@
#define VETH_XDP_TX BIT(0)
#define VETH_XDP_REDIR BIT(1)
+#define VETH_XDP_TX_BULK_SIZE 16
+
struct veth_rq_stats {
u64 xdp_packets;
u64 xdp_bytes;
@@ -64,6 +66,11 @@ struct veth_priv {
unsigned int requested_headroom;
};
+struct veth_xdp_tx_bq {
+ struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
+ unsigned int count;
+};
+
/*
* ethtool interface
*/
@@ -442,13 +449,30 @@ drop:
return ret;
}
-static void veth_xdp_flush(struct net_device *dev)
+static void veth_xdp_flush_bq(struct net_device *dev, struct veth_xdp_tx_bq *bq)
+{
+ int sent, i, err = 0;
+
+ sent = veth_xdp_xmit(dev, bq->count, bq->q, 0);
+ if (sent < 0) {
+ err = sent;
+ sent = 0;
+ for (i = 0; i < bq->count; i++)
+ xdp_return_frame(bq->q[i]);
+ }
+ trace_xdp_bulk_tx(dev, sent, bq->count - sent, err);
+
+ bq->count = 0;
+}
+
+static void veth_xdp_flush(struct net_device *dev, struct veth_xdp_tx_bq *bq)
{
struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
struct net_device *rcv;
struct veth_rq *rq;
rcu_read_lock();
+ veth_xdp_flush_bq(dev, bq);
rcv = rcu_dereference(priv->peer);
if (unlikely(!rcv))
goto out;
@@ -464,19 +488,26 @@ out:
rcu_read_unlock();
}
-static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
+static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp,
+ struct veth_xdp_tx_bq *bq)
{
struct xdp_frame *frame = convert_to_xdp_frame(xdp);
if (unlikely(!frame))
return -EOVERFLOW;
- return veth_xdp_xmit(dev, 1, &frame, 0);
+ if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
+ veth_xdp_flush_bq(dev, bq);
+
+ bq->q[bq->count++] = frame;
+
+ return 0;
}
static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
struct xdp_frame *frame,
- unsigned int *xdp_xmit)
+ unsigned int *xdp_xmit,
+ struct veth_xdp_tx_bq *bq)
{
void *hard_start = frame->data - frame->headroom;
void *head = hard_start - sizeof(struct xdp_frame);
@@ -509,7 +540,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
orig_frame = *frame;
xdp.data_hard_start = head;
xdp.rxq->mem = frame->mem;
- if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
+ if (unlikely(veth_xdp_tx(rq->dev, &xdp, bq) < 0)) {
trace_xdp_exception(rq->dev, xdp_prog, act);
frame = &orig_frame;
goto err_xdp;
@@ -560,7 +591,8 @@ xdp_xmit:
}
static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb,
- unsigned int *xdp_xmit)
+ unsigned int *xdp_xmit,
+ struct veth_xdp_tx_bq *bq)
{
u32 pktlen, headroom, act, metalen;
void *orig_data, *orig_data_end;
@@ -636,7 +668,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb,
get_page(virt_to_page(xdp.data));
consume_skb(skb);
xdp.rxq->mem = rq->xdp_mem;
- if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
+ if (unlikely(veth_xdp_tx(rq->dev, &xdp, bq) < 0)) {
trace_xdp_exception(rq->dev, xdp_prog, act);
goto err_xdp;
}
@@ -691,7 +723,8 @@ xdp_xmit:
return NULL;
}
-static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit)
+static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit,
+ struct veth_xdp_tx_bq *bq)
{
int i, done = 0, drops = 0, bytes = 0;
@@ -707,11 +740,11 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit)
struct xdp_frame *frame = veth_ptr_to_xdp(ptr);
bytes += frame->len;
- skb = veth_xdp_rcv_one(rq, frame, &xdp_xmit_one);
+ skb = veth_xdp_rcv_one(rq, frame, &xdp_xmit_one, bq);
} else {
skb = ptr;
bytes += skb->len;
- skb = veth_xdp_rcv_skb(rq, skb, &xdp_xmit_one);
+ skb = veth_xdp_rcv_skb(rq, skb, &xdp_xmit_one, bq);
}
*xdp_xmit |= xdp_xmit_one;
@@ -737,10 +770,13 @@ static int veth_poll(struct napi_struct *napi, int budget)
struct veth_rq *rq =
container_of(napi, struct veth_rq, xdp_napi);
unsigned int xdp_xmit = 0;
+ struct veth_xdp_tx_bq bq;
int done;
+ bq.count = 0;
+
xdp_set_return_frame_no_direct();
- done = veth_xdp_rcv(rq, budget, &xdp_xmit);
+ done = veth_xdp_rcv(rq, budget, &xdp_xmit, &bq);
if (done < budget && napi_complete_done(napi, done)) {
/* Write rx_notify_masked before reading ptr_ring */
@@ -752,7 +788,7 @@ static int veth_poll(struct napi_struct *napi, int budget)
}
if (xdp_xmit & VETH_XDP_TX)
- veth_xdp_flush(rq->dev);
+ veth_xdp_flush(rq->dev, &bq);
if (xdp_xmit & VETH_XDP_REDIR)
xdp_do_flush_map();
xdp_clear_return_frame_no_direct();
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index bd79ae32909a..169fd25f6bc2 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -124,6 +124,14 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
loff_t *ppos, void **new_buf,
enum bpf_attach_type type);
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level,
+ int *optname, char __user *optval,
+ int *optlen, char **kernel_optval);
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+ int optname, char __user *optval,
+ int __user *optlen, int max_optlen,
+ int retval);
+
static inline enum bpf_cgroup_storage_type cgroup_storage_type(
struct bpf_map *map)
{
@@ -286,6 +294,38 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
__ret; \
})
+#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
+ kernel_optval) \
+({ \
+ int __ret = 0; \
+ if (cgroup_bpf_enabled) \
+ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \
+ optname, optval, \
+ optlen, \
+ kernel_optval); \
+ __ret; \
+})
+
+#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \
+({ \
+ int __ret = 0; \
+ if (cgroup_bpf_enabled) \
+ get_user(__ret, optlen); \
+ __ret; \
+})
+
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen, \
+ max_optlen, retval) \
+({ \
+ int __ret = retval; \
+ if (cgroup_bpf_enabled) \
+ __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \
+ optname, optval, \
+ optlen, max_optlen, \
+ retval); \
+ __ret; \
+})
+
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog);
int cgroup_bpf_prog_detach(const union bpf_attr *attr,
@@ -357,6 +397,11 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; })
+#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
+ optlen, max_optlen, retval) ({ retval; })
+#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
+ kernel_optval) ({ 0; })
#define for_each_cgroup_storage_type(stype) for (; false; )
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a62e7889b0b6..18f4cc2c6acd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -518,6 +518,7 @@ struct bpf_prog_array {
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
void bpf_prog_array_free(struct bpf_prog_array *progs);
int bpf_prog_array_length(struct bpf_prog_array *progs);
+bool bpf_prog_array_is_empty(struct bpf_prog_array *array);
int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
__u32 __user *prog_ids, u32 cnt);
@@ -1051,6 +1052,7 @@ extern const struct bpf_func_proto bpf_spin_unlock_proto;
extern const struct bpf_func_proto bpf_get_local_storage_proto;
extern const struct bpf_func_proto bpf_strtol_proto;
extern const struct bpf_func_proto bpf_strtoul_proto;
+extern const struct bpf_func_proto bpf_tcp_sock_proto;
/* Shared helpers among cBPF and eBPF. */
void bpf_user_rnd_init_once(void);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5a9975678d6f..eec5aeeeaf92 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -30,6 +30,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable)
#ifdef CONFIG_CGROUP_BPF
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt)
#endif
#ifdef CONFIG_BPF_LIRC_MODE2
BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 43b45d6db36d..1fe53e78c7e3 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -578,8 +578,9 @@ struct bpf_skb_data_end {
};
struct bpf_redirect_info {
- u32 ifindex;
u32 flags;
+ u32 tgt_index;
+ void *tgt_value;
struct bpf_map *map;
struct bpf_map *map_to_flush;
u32 kern_flags;
@@ -1199,4 +1200,14 @@ struct bpf_sysctl_kern {
u64 tmp_reg;
};
+struct bpf_sockopt_kern {
+ struct sock *sk;
+ u8 *optval;
+ u8 *optval_end;
+ s32 level;
+ s32 optname;
+ s32 optlen;
+ s32 retval;
+};
+
#endif /* __LINUX_FILTER_H__ */
diff --git a/include/linux/list.h b/include/linux/list.h
index e951228db4b2..85c92555e31f 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -106,6 +106,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next)
WRITE_ONCE(prev->next, next);
}
+/*
+ * Delete a list entry and clear the 'prev' pointer.
+ *
+ * This is a special-purpose list clearing method used in the networking code
+ * for lists allocated as per-cpu, where we don't want to incur the extra
+ * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this
+ * needs to check the node 'prev' pointer instead of calling list_empty().
+ */
+static inline void __list_del_clearprev(struct list_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+ entry->prev = NULL;
+}
+
/**
* list_del - deletes entry from list.
* @entry: the element to delete from the list.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9d36cc88d043..e16d8a3fd3b4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2221,6 +2221,14 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
}
+static inline void tcp_bpf_rtt(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTT_CB_FLAG))
+ tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL);
+}
+
#if IS_ENABLED(CONFIG_SMC)
extern struct static_key_false tcp_have_smc;
#endif
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index ae0f368a62bb..057b159ff8b9 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -77,10 +77,11 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
void xsk_flush(struct xdp_sock *xs);
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
/* Used from netdev driver */
+bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt);
u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
void xsk_umem_discard_addr(struct xdp_umem *umem);
void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
-bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len);
+bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc);
void xsk_umem_consume_tx_done(struct xdp_umem *umem);
struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
@@ -99,6 +100,16 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
}
/* Reuse-queue aware version of FILL queue helpers */
+static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
+{
+ struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
+
+ if (rq->length >= cnt)
+ return true;
+
+ return xsk_umem_has_addrs(umem, cnt - rq->length);
+}
+
static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
{
struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
@@ -146,6 +157,11 @@ static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
return false;
}
+static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
+{
+ return false;
+}
+
static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
return NULL;
@@ -159,8 +175,8 @@ static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
}
-static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma,
- u32 *len)
+static inline bool xsk_umem_consume_tx(struct xdp_umem *umem,
+ struct xdp_desc *desc)
{
return false;
}
@@ -200,6 +216,11 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
return 0;
}
+static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
+{
+ return false;
+}
+
static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
{
return NULL;
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index bb5e380e2ef3..68899fdc985b 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -50,6 +50,35 @@ TRACE_EVENT(xdp_exception,
__entry->ifindex)
);
+TRACE_EVENT(xdp_bulk_tx,
+
+ TP_PROTO(const struct net_device *dev,
+ int sent, int drops, int err),
+
+ TP_ARGS(dev, sent, drops, err),
+
+ TP_STRUCT__entry(
+ __field(int, ifindex)
+ __field(u32, act)
+ __field(int, drops)
+ __field(int, sent)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __entry->ifindex = dev->ifindex;
+ __entry->act = XDP_TX;
+ __entry->drops = drops;
+ __entry->sent = sent;
+ __entry->err = err;
+ ),
+
+ TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d",
+ __entry->ifindex,
+ __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+ __entry->sent, __entry->drops, __entry->err)
+);
+
DECLARE_EVENT_CLASS(xdp_redirect_template,
TP_PROTO(const struct net_device *dev,
@@ -146,9 +175,8 @@ struct _bpf_dtab_netdev {
#endif /* __DEVMAP_OBJ_TYPE */
#define devmap_ifindex(fwd, map) \
- (!fwd ? 0 : \
- ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \
- ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))
+ ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \
+ ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
#define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \
trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b077507efa3f..ead27aebf491 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -170,6 +170,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_FLOW_DISSECTOR,
BPF_PROG_TYPE_CGROUP_SYSCTL,
BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+ BPF_PROG_TYPE_CGROUP_SOCKOPT,
};
enum bpf_attach_type {
@@ -194,6 +195,8 @@ enum bpf_attach_type {
BPF_CGROUP_SYSCTL,
BPF_CGROUP_UDP4_RECVMSG,
BPF_CGROUP_UDP6_RECVMSG,
+ BPF_CGROUP_GETSOCKOPT,
+ BPF_CGROUP_SETSOCKOPT,
__MAX_BPF_ATTACH_TYPE
};
@@ -1568,8 +1571,11 @@ union bpf_attr {
* but this is only implemented for native XDP (with driver
* support) as of this writing).
*
- * All values for *flags* are reserved for future usage, and must
- * be left at zero.
+ * The lower two bits of *flags* are used as the return code if
+ * the map lookup fails. This is so that the return value can be
+ * one of the XDP program return codes up to XDP_TX, as chosen by
+ * the caller. Any higher bits in the *flags* argument must be
+ * unset.
*
* When used to redirect packets to net devices, this helper
* provides a high performance increase over **bpf_redirect**\ ().
@@ -1764,6 +1770,7 @@ union bpf_attr {
* * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
* * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
* * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT)
*
* Therefore, this function can be used to clear a callback flag by
* setting the appropriate bit to zero. e.g. to disable the RTO
@@ -3066,6 +3073,12 @@ struct bpf_tcp_sock {
* sum(delta(snd_una)), or how many bytes
* were acked.
*/
+ __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups
+ * total number of DSACK blocks received
+ */
+ __u32 delivered; /* Total data packets delivered incl. rexmits */
+ __u32 delivered_ce; /* Like the above but only ECE marked packets */
+ __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */
};
struct bpf_sock_tuple {
@@ -3308,7 +3321,8 @@ struct bpf_sock_ops {
#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0)
#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1)
#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2)
-#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently
+#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently
* supported cb flags
*/
@@ -3363,6 +3377,8 @@ enum {
BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after
* socket transition to LISTEN state.
*/
+ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
@@ -3541,4 +3557,15 @@ struct bpf_sysctl {
*/
};
+struct bpf_sockopt {
+ __bpf_md_ptr(struct bpf_sock *, sk);
+ __bpf_md_ptr(void *, optval);
+ __bpf_md_ptr(void *, optval_end);
+
+ __s32 level;
+ __s32 optname;
+ __s32 optlen;
+ __s32 retval;
+};
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index caed8b1614ff..faaa5ca2a117 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -46,6 +46,7 @@ struct xdp_mmap_offsets {
#define XDP_UMEM_FILL_RING 5
#define XDP_UMEM_COMPLETION_RING 6
#define XDP_STATISTICS 7
+#define XDP_OPTIONS 8
struct xdp_umem_reg {
__u64 addr; /* Start of packet data area */
@@ -60,6 +61,13 @@ struct xdp_statistics {
__u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
};
+struct xdp_options {
+ __u32 flags;
+};
+
+/* Flags for the flags field of struct xdp_options */
+#define XDP_OPTIONS_ZEROCOPY (1 << 0)
+
/* Pgoff for mmaping the rings */
#define XDP_PGOFF_RX_RING 0
#define XDP_PGOFF_TX_RING 0x80000000
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index c225c42e114a..76fa0076f20d 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -15,6 +15,9 @@
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>
+#include <net/bpf_sk_storage.h>
+
+#include "../cgroup/cgroup-internal.h"
DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
@@ -38,6 +41,8 @@ static void cgroup_bpf_release(struct work_struct *work)
struct bpf_prog_array *old_array;
unsigned int type;
+ mutex_lock(&cgroup_mutex);
+
for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog_list *pl, *tmp;
@@ -54,10 +59,12 @@ static void cgroup_bpf_release(struct work_struct *work)
}
old_array = rcu_dereference_protected(
cgrp->bpf.effective[type],
- percpu_ref_is_dying(&cgrp->bpf.refcnt));
+ lockdep_is_held(&cgroup_mutex));
bpf_prog_array_free(old_array);
}
+ mutex_unlock(&cgroup_mutex);
+
percpu_ref_exit(&cgrp->bpf.refcnt);
cgroup_put(cgrp);
}
@@ -229,6 +236,9 @@ static int update_effective_progs(struct cgroup *cgrp,
css_for_each_descendant_pre(css, &cgrp->self) {
struct cgroup *desc = container_of(css, struct cgroup, self);
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
err = compute_effective_progs(desc, type, &desc->bpf.inactive);
if (err)
goto cleanup;
@@ -238,6 +248,14 @@ static int update_effective_progs(struct cgroup *cgrp,
css_for_each_descendant_pre(css, &cgrp->self) {
struct cgroup *desc = container_of(css, struct cgroup, self);
+ if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
+ if (unlikely(desc->bpf.inactive)) {
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+ continue;
+ }
+
activate_effective_progs(desc, type, desc->bpf.inactive);
desc->bpf.inactive = NULL;
}
@@ -921,6 +939,188 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
+static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
+ enum bpf_attach_type attach_type)
+{
+ struct bpf_prog_array *prog_array;
+ bool empty;
+
+ rcu_read_lock();
+ prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
+ empty = bpf_prog_array_is_empty(prog_array);
+ rcu_read_unlock();
+
+ return empty;
+}
+
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+{
+ if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+ return -EINVAL;
+
+ ctx->optval = kzalloc(max_optlen, GFP_USER);
+ if (!ctx->optval)
+ return -ENOMEM;
+
+ ctx->optval_end = ctx->optval + max_optlen;
+ ctx->optlen = max_optlen;
+
+ return 0;
+}
+
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+{
+ kfree(ctx->optval);
+}
+
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
+ int *optname, char __user *optval,
+ int *optlen, char **kernel_optval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = *level,
+ .optname = *optname,
+ };
+ int ret;
+
+ /* Opportunistic check to see whether we have any BPF program
+ * attached to the hook so we don't waste time allocating
+ * memory and locking the socket.
+ */
+ if (!cgroup_bpf_enabled ||
+ __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+ return 0;
+
+ ret = sockopt_alloc_buf(&ctx, *optlen);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ lock_sock(sk);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
+ &ctx, BPF_PROG_RUN);
+ release_sock(sk);
+
+ if (!ret) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (ctx.optlen == -1) {
+ /* optlen set to -1, bypass kernel */
+ ret = 1;
+ } else if (ctx.optlen > *optlen || ctx.optlen < -1) {
+ /* optlen is out of bounds */
+ ret = -EFAULT;
+ } else {
+ /* optlen within bounds, run kernel handler */
+ ret = 0;
+
+ /* export any potential modifications */
+ *level = ctx.level;
+ *optname = ctx.optname;
+ *optlen = ctx.optlen;
+ *kernel_optval = ctx.optval;
+ }
+
+out:
+ if (ret)
+ sockopt_free_buf(&ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
+
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+ int optname, char __user *optval,
+ int __user *optlen, int max_optlen,
+ int retval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = level,
+ .optname = optname,
+ .retval = retval,
+ };
+ int ret;
+
+ /* Opportunistic check to see whether we have any BPF program
+ * attached to the hook so we don't waste time allocating
+ * memory and locking the socket.
+ */
+ if (!cgroup_bpf_enabled ||
+ __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+ return retval;
+
+ ret = sockopt_alloc_buf(&ctx, max_optlen);
+ if (ret)
+ return ret;
+
+ if (!retval) {
+ /* If kernel getsockopt finished successfully,
+ * copy whatever was returned to the user back
+ * into our temporary buffer. Set optlen to the
+ * one that kernel returned as well to let
+ * BPF programs inspect the value.
+ */
+
+ if (get_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ctx.optlen > max_optlen)
+ ctx.optlen = max_optlen;
+
+ if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ lock_sock(sk);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+ &ctx, BPF_PROG_RUN);
+ release_sock(sk);
+
+ if (!ret) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (ctx.optlen > max_optlen) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* BPF programs only allowed to set retval to 0, not some
+ * arbitrary value.
+ */
+ if (ctx.retval != 0 && ctx.retval != retval) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+ put_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = ctx.retval;
+
+out:
+ sockopt_free_buf(&ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
+
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
size_t *lenp)
{
@@ -1181,3 +1381,153 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
const struct bpf_prog_ops cg_sysctl_prog_ops = {
};
+
+static const struct bpf_func_proto *
+cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+#endif
+ default:
+ return cgroup_base_func_proto(func_id, prog);
+ }
+}
+
+static bool cg_sockopt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct bpf_sockopt))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sockopt, retval):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type ==
+ BPF_CGROUP_GETSOCKOPT;
+ case offsetof(struct bpf_sockopt, optname):
+ /* fallthrough */
+ case offsetof(struct bpf_sockopt, level):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type ==
+ BPF_CGROUP_SETSOCKOPT;
+ case offsetof(struct bpf_sockopt, optlen):
+ return size == size_default;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case offsetof(struct bpf_sockopt, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
+ case offsetof(struct bpf_sockopt, optval):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct bpf_sockopt, optval_end):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case offsetof(struct bpf_sockopt, retval):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+ default:
+ if (size != size_default)
+ return false;
+ break;
+ }
+ return true;
+}
+
+#define CG_SOCKOPT_ACCESS_FIELD(T, F) \
+ T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F))
+
+static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sockopt, sk):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+ break;
+ case offsetof(struct bpf_sockopt, level):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+ break;
+ case offsetof(struct bpf_sockopt, optname):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+ break;
+ case offsetof(struct bpf_sockopt, optlen):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+ break;
+ case offsetof(struct bpf_sockopt, retval):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+ break;
+ case offsetof(struct bpf_sockopt, optval):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+ break;
+ case offsetof(struct bpf_sockopt, optval_end):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
+ bool direct_write,
+ const struct bpf_prog *prog)
+{
+ /* Nothing to do for sockopt argument. The data is kzalloc'ated.
+ */
+ return 0;
+}
+
+const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
+ .get_func_proto = cg_sockopt_func_proto,
+ .is_valid_access = cg_sockopt_is_valid_access,
+ .convert_ctx_access = cg_sockopt_convert_ctx_access,
+ .gen_prologue = cg_sockopt_get_prologue,
+};
+
+const struct bpf_prog_ops cg_sockopt_prog_ops = {
+};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ad3be85f1411..e2c1b43728da 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1809,6 +1809,15 @@ int bpf_prog_array_length(struct bpf_prog_array *array)
return cnt;
}
+bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
+{
+ struct bpf_prog_array_item *item;
+
+ for (item = array->items; item->prog; item++)
+ if (item->prog != &dummy_bpf_prog.prog)
+ return false;
+ return true;
+}
static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
u32 *prog_ids,
@@ -2101,3 +2110,4 @@ EXPORT_SYMBOL(bpf_stats_enabled_key);
#include <linux/bpf_trace.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8dff08768087..ef49e17ae47c 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -32,14 +32,19 @@
/* General idea: XDP packets getting XDP redirected to another CPU,
* will maximum be stored/queued for one driver ->poll() call. It is
- * guaranteed that setting flush bit and flush operation happen on
+ * guaranteed that queueing the frame and the flush operation happen on
* same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
* which queue in bpf_cpu_map_entry contains packets.
*/
#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
+struct bpf_cpu_map_entry;
+struct bpf_cpu_map;
+
struct xdp_bulk_queue {
void *q[CPU_MAP_BULK_SIZE];
+ struct list_head flush_node;
+ struct bpf_cpu_map_entry *obj;
unsigned int count;
};
@@ -52,6 +57,8 @@ struct bpf_cpu_map_entry {
/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
struct xdp_bulk_queue __percpu *bulkq;
+ struct bpf_cpu_map *cmap;
+
/* Queue with potential multi-producers, and single-consumer kthread */
struct ptr_ring *queue;
struct task_struct *kthread;
@@ -65,23 +72,17 @@ struct bpf_cpu_map {
struct bpf_map map;
/* Below members specific for map type */
struct bpf_cpu_map_entry **cpu_map;
- unsigned long __percpu *flush_needed;
+ struct list_head __percpu *flush_list;
};
-static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
- struct xdp_bulk_queue *bq, bool in_napi_ctx);
-
-static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
-{
- return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
-}
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
struct bpf_cpu_map *cmap;
int err = -ENOMEM;
+ int ret, cpu;
u64 cost;
- int ret;
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
@@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
- cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
+ cost += sizeof(struct list_head) * num_possible_cpus();
/* Notice returns -EPERM on if map size is larger than memlock limit */
ret = bpf_map_charge_init(&cmap->map.memory, cost);
@@ -114,12 +115,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
goto free_cmap;
}
- /* A per cpu bitfield with a bit per possible CPU in map */
- cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
- __alignof__(unsigned long));
- if (!cmap->flush_needed)
+ cmap->flush_list = alloc_percpu(struct list_head);
+ if (!cmap->flush_list)
goto free_charge;
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
+
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
@@ -129,7 +131,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
return &cmap->map;
free_percpu:
- free_percpu(cmap->flush_needed);
+ free_percpu(cmap->flush_list);
free_charge:
bpf_map_charge_finish(&cmap->map.memory);
free_cmap:
@@ -334,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct bpf_cpu_map_entry *rcpu;
- int numa, err;
+ struct xdp_bulk_queue *bq;
+ int numa, err, i;
/* Have map->numa_node, but choose node of redirect target CPU */
numa = cpu_to_node(cpu);
@@ -349,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
if (!rcpu->bulkq)
goto free_rcu;
+ for_each_possible_cpu(i) {
+ bq = per_cpu_ptr(rcpu->bulkq, i);
+ bq->obj = rcpu;
+ }
+
/* Alloc queue */
rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
if (!rcpu->queue)
@@ -405,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
/* No concurrent bq_enqueue can run at this point */
- bq_flush_to_queue(rcpu, bq, false);
+ bq_flush_to_queue(bq, false);
}
free_percpu(rcpu->bulkq);
/* Cannot kthread_stop() here, last put free rcpu resources */
@@ -488,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
if (!rcpu)
return -ENOMEM;
+ rcpu->cmap = cmap;
}
rcu_read_lock();
__cpu_map_entry_replace(cmap, key_cpu, rcpu);
@@ -514,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map)
synchronize_rcu();
/* To ensure all pending flush operations have completed wait for flush
- * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
- * Because the above synchronize_rcu() ensures the map is disconnected
- * from the program we can assume no new bits will be set.
+ * list be empty on _all_ cpus. Because the above synchronize_rcu()
+ * ensures the map is disconnected from the program we can assume no new
+ * items will be added to the list.
*/
for_each_online_cpu(cpu) {
- unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+ struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
- while (!bitmap_empty(bitmap, cmap->map.max_entries))
+ while (!list_empty(flush_list))
cond_resched();
}
@@ -538,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map)
/* bq flush and cleanup happens after RCU graze-period */
__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
}
- free_percpu(cmap->flush_needed);
+ free_percpu(cmap->flush_list);
bpf_map_area_free(cmap->cpu_map);
kfree(cmap);
}
@@ -590,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = {
.map_check_btf = map_check_no_btf,
};
-static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
- struct xdp_bulk_queue *bq, bool in_napi_ctx)
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
{
+ struct bpf_cpu_map_entry *rcpu = bq->obj;
unsigned int processed = 0, drops = 0;
const int to_cpu = rcpu->cpu;
struct ptr_ring *q;
@@ -621,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
bq->count = 0;
spin_unlock(&q->producer_lock);
+ __list_del_clearprev(&bq->flush_node);
+
/* Feedback loop via tracepoints */
trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
return 0;
@@ -631,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
*/
static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
+ struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
- bq_flush_to_queue(rcpu, bq, true);
+ bq_flush_to_queue(bq, true);
/* Notice, xdp_buff/page MUST be queued here, long enough for
* driver to code invoking us to finished, due to driver
@@ -646,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
* operation, when completing napi->poll call.
*/
bq->q[bq->count++] = xdpf;
+
+ if (!bq->flush_node.prev)
+ list_add(&bq->flush_node, flush_list);
+
return 0;
}
@@ -665,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}
-void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
-{
- struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
-
- __set_bit(bit, bitmap);
-}
-
void __cpu_map_flush(struct bpf_map *map)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
- u32 bit;
-
- /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
- * and __cpu_map_flush() happen on same CPU. Thus, the percpu
- * bitmap indicate which percpu bulkq have packets.
- */
- for_each_set_bit(bit, bitmap, map->max_entries) {
- struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
- struct xdp_bulk_queue *bq;
-
- /* This is possible if entry is removed by user space
- * between xdp redirect and flush op.
- */
- if (unlikely(!rcpu))
- continue;
-
- __clear_bit(bit, bitmap);
+ struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
+ struct xdp_bulk_queue *bq, *tmp;
- /* Flush all frames in bulkq to real queue */
- bq = this_cpu_ptr(rcpu->bulkq);
- bq_flush_to_queue(rcpu, bq, true);
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+ bq_flush_to_queue(bq, true);
/* If already running, costs spin_lock_irqsave + smb_mb */
- wake_up_process(rcpu->kthread);
+ wake_up_process(bq->obj->kthread);
}
}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 40e86a7e0ef0..d83cf8ccc872 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -17,9 +17,8 @@
* datapath always has a valid copy. However, the datapath does a "flush"
* operation that pushes any pending packets in the driver outside the RCU
* critical section. Each bpf_dtab_netdev tracks these pending operations using
- * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
- * until all bits are cleared indicating outstanding flush operations have
- * completed.
+ * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
+ * this list is empty, indicating outstanding flush operations have completed.
*
* BPF syscalls may race with BPF program calls on any of the update, delete
* or lookup operations. As noted above the xchg() operation also keep the
@@ -48,9 +47,13 @@
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
#define DEV_MAP_BULK_SIZE 16
+struct bpf_dtab_netdev;
+
struct xdp_bulk_queue {
struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+ struct list_head flush_node;
struct net_device *dev_rx;
+ struct bpf_dtab_netdev *obj;
unsigned int count;
};
@@ -65,23 +68,18 @@ struct bpf_dtab_netdev {
struct bpf_dtab {
struct bpf_map map;
struct bpf_dtab_netdev **netdev_map;
- unsigned long __percpu *flush_needed;
+ struct list_head __percpu *flush_list;
struct list_head list;
};
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
-static u64 dev_map_bitmap_size(const union bpf_attr *attr)
-{
- return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
-}
-
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
struct bpf_dtab *dtab;
+ int err, cpu;
u64 cost;
- int err;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
@@ -91,6 +89,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
+ /* Lookup returns a pointer straight to dev->ifindex, so make sure the
+ * verifier prevents writes from the BPF side
+ */
+ attr->map_flags |= BPF_F_RDONLY_PROG;
+
dtab = kzalloc(sizeof(*dtab), GFP_USER);
if (!dtab)
return ERR_PTR(-ENOMEM);
@@ -99,7 +102,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
- cost += dev_map_bitmap_size(attr) * num_possible_cpus();
+ cost += sizeof(struct list_head) * num_possible_cpus();
/* if map size is larger than memlock limit, reject it */
err = bpf_map_charge_init(&dtab->map.memory, cost);
@@ -108,28 +111,30 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
err = -ENOMEM;
- /* A per cpu bitfield with a bit per possible net device */
- dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
- __alignof__(unsigned long),
- GFP_KERNEL | __GFP_NOWARN);
- if (!dtab->flush_needed)
+ dtab->flush_list = alloc_percpu(struct list_head);
+ if (!dtab->flush_list)
goto free_charge;
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
+
dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
sizeof(struct bpf_dtab_netdev *),
dtab->map.numa_node);
if (!dtab->netdev_map)
- goto free_charge;
+ goto free_percpu;
spin_lock(&dev_map_lock);
list_add_tail_rcu(&dtab->list, &dev_map_list);
spin_unlock(&dev_map_lock);
return &dtab->map;
+
+free_percpu:
+ free_percpu(dtab->flush_list);
free_charge:
bpf_map_charge_finish(&dtab->map.memory);
free_dtab:
- free_percpu(dtab->flush_needed);
kfree(dtab);
return ERR_PTR(err);
}
@@ -158,14 +163,14 @@ static void dev_map_free(struct bpf_map *map)
rcu_barrier();
/* To ensure all pending flush operations have completed wait for flush
- * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * list to empty on _all_ cpus.
* Because the above synchronize_rcu() ensures the map is disconnected
- * from the program we can assume no new bits will be set.
+ * from the program we can assume no new items will be added.
*/
for_each_online_cpu(cpu) {
- unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+ struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);
- while (!bitmap_empty(bitmap, dtab->map.max_entries))
+ while (!list_empty(flush_list))
cond_resched();
}
@@ -181,7 +186,7 @@ static void dev_map_free(struct bpf_map *map)
kfree(dev);
}
- free_percpu(dtab->flush_needed);
+ free_percpu(dtab->flush_list);
bpf_map_area_free(dtab->netdev_map);
kfree(dtab);
}
@@ -203,18 +208,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
-{
- struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
-
- __set_bit(bit, bitmap);
-}
-
-static int bq_xmit_all(struct bpf_dtab_netdev *obj,
- struct xdp_bulk_queue *bq, u32 flags,
+static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
bool in_napi_ctx)
{
+ struct bpf_dtab_netdev *obj = bq->obj;
struct net_device *dev = obj->dev;
int sent = 0, drops = 0, err = 0;
int i;
@@ -241,6 +238,7 @@ out:
trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
sent, drops, bq->dev_rx, dev, err);
bq->dev_rx = NULL;
+ __list_del_clearprev(&bq->flush_node);
return 0;
error:
/* If ndo_xdp_xmit fails with an errno, no frames have been
@@ -263,31 +261,18 @@ error:
* from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled
* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
- * net device can be torn down. On devmap tear down we ensure the ctx bitmap
- * is zeroed before completing to ensure all flush operations have completed.
+ * net device can be torn down. On devmap tear down we ensure the flush list
+ * is empty before completing to ensure all flush operations have completed.
*/
void __dev_map_flush(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
- u32 bit;
+ struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
+ struct xdp_bulk_queue *bq, *tmp;
rcu_read_lock();
- for_each_set_bit(bit, bitmap, map->max_entries) {
- struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
- struct xdp_bulk_queue *bq;
-
- /* This is possible if the dev entry is removed by user space
- * between xdp redirect and flush op.
- */
- if (unlikely(!dev))
- continue;
-
- bq = this_cpu_ptr(dev->bulkq);
- bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true);
-
- __clear_bit(bit, bitmap);
- }
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+ bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
rcu_read_unlock();
}
@@ -314,10 +299,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
+ struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
- bq_xmit_all(obj, bq, 0, true);
+ bq_xmit_all(bq, 0, true);
/* Ingress dev_rx will be the same for all xdp_frame's in
* bulk_queue, because bq stored per-CPU and must be flushed
@@ -327,6 +313,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
bq->dev_rx = dev_rx;
bq->q[bq->count++] = xdpf;
+
+ if (!bq->flush_node.prev)
+ list_add(&bq->flush_node, flush_list);
+
return 0;
}
@@ -377,17 +367,12 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
if (dev->dev->netdev_ops->ndo_xdp_xmit) {
struct xdp_bulk_queue *bq;
- unsigned long *bitmap;
-
int cpu;
rcu_read_lock();
for_each_online_cpu(cpu) {
- bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
- __clear_bit(dev->bit, bitmap);
-
bq = per_cpu_ptr(dev->bulkq, cpu);
- bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false);
+ bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
}
rcu_read_unlock();
}
@@ -434,8 +419,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
struct net *net = current->nsproxy->net_ns;
gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
struct bpf_dtab_netdev *dev, *old_dev;
- u32 i = *(u32 *)key;
u32 ifindex = *(u32 *)value;
+ struct xdp_bulk_queue *bq;
+ u32 i = *(u32 *)key;
+ int cpu;
if (unlikely(map_flags > BPF_EXIST))
return -EINVAL;
@@ -458,6 +445,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
return -ENOMEM;
}
+ for_each_possible_cpu(cpu) {
+ bq = per_cpu_ptr(dev->bulkq, cpu);
+ bq->obj = dev;
+ }
+
dev->dev = dev_get_by_index(net, ifindex);
if (!dev->dev) {
free_percpu(dev->bulkq);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7713cf39795a..b0f545e07425 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1590,6 +1590,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
default:
return -EINVAL;
}
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_SETSOCKOPT:
+ case BPF_CGROUP_GETSOCKOPT:
+ return 0;
+ default:
+ return -EINVAL;
+ }
default:
return 0;
}
@@ -1840,6 +1848,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
switch (prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
case BPF_PROG_TYPE_CGROUP_SKB:
return prog->enforce_expected_attach_type &&
@@ -1912,6 +1921,10 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_SYSCTL:
ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
break;
+ case BPF_CGROUP_GETSOCKOPT:
+ case BPF_CGROUP_SETSOCKOPT:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+ break;
default:
return -EINVAL;
}
@@ -1995,6 +2008,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_CGROUP_SYSCTL:
ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
break;
+ case BPF_CGROUP_GETSOCKOPT:
+ case BPF_CGROUP_SETSOCKOPT:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+ break;
default:
return -EINVAL;
}
@@ -2031,6 +2048,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
case BPF_CGROUP_SYSCTL:
+ case BPF_CGROUP_GETSOCKOPT:
+ case BPF_CGROUP_SETSOCKOPT:
break;
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0e079b2298f8..a2e763703c30 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1659,16 +1659,18 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
}
}
-static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
+ int spi)
{
struct bpf_verifier_state *st = env->cur_state;
int first_idx = st->first_insn_idx;
int last_idx = env->insn_idx;
struct bpf_func_state *func;
struct bpf_reg_state *reg;
- u32 reg_mask = 1u << regno;
- u64 stack_mask = 0;
+ u32 reg_mask = regno >= 0 ? 1u << regno : 0;
+ u64 stack_mask = spi >= 0 ? 1ull << spi : 0;
bool skip_first = true;
+ bool new_marks = false;
int i, err;
if (!env->allow_ptr_leaks)
@@ -1676,18 +1678,43 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
return 0;
func = st->frame[st->curframe];
- reg = &func->regs[regno];
- if (reg->type != SCALAR_VALUE) {
- WARN_ONCE(1, "backtracing misuse");
- return -EFAULT;
+ if (regno >= 0) {
+ reg = &func->regs[regno];
+ if (reg->type != SCALAR_VALUE) {
+ WARN_ONCE(1, "backtracing misuse");
+ return -EFAULT;
+ }
+ if (!reg->precise)
+ new_marks = true;
+ else
+ reg_mask = 0;
+ reg->precise = true;
+ }
+
+ while (spi >= 0) {
+ if (func->stack[spi].slot_type[0] != STACK_SPILL) {
+ stack_mask = 0;
+ break;
+ }
+ reg = &func->stack[spi].spilled_ptr;
+ if (reg->type != SCALAR_VALUE) {
+ stack_mask = 0;
+ break;
+ }
+ if (!reg->precise)
+ new_marks = true;
+ else
+ stack_mask = 0;
+ reg->precise = true;
+ break;
}
- if (reg->precise)
- return 0;
- func->regs[regno].precise = true;
+ if (!new_marks)
+ return 0;
+ if (!reg_mask && !stack_mask)
+ return 0;
for (;;) {
DECLARE_BITMAP(mask, 64);
- bool new_marks = false;
u32 history = st->jmp_history_cnt;
if (env->log.level & BPF_LOG_LEVEL)
@@ -1730,12 +1757,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
if (!st)
break;
+ new_marks = false;
func = st->frame[st->curframe];
bitmap_from_u64(mask, reg_mask);
for_each_set_bit(i, mask, 32) {
reg = &func->regs[i];
- if (reg->type != SCALAR_VALUE)
+ if (reg->type != SCALAR_VALUE) {
+ reg_mask &= ~(1u << i);
continue;
+ }
if (!reg->precise)
new_marks = true;
reg->precise = true;
@@ -1756,11 +1786,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
return -EFAULT;
}
- if (func->stack[i].slot_type[0] != STACK_SPILL)
+ if (func->stack[i].slot_type[0] != STACK_SPILL) {
+ stack_mask &= ~(1ull << i);
continue;
+ }
reg = &func->stack[i].spilled_ptr;
- if (reg->type != SCALAR_VALUE)
+ if (reg->type != SCALAR_VALUE) {
+ stack_mask &= ~(1ull << i);
continue;
+ }
if (!reg->precise)
new_marks = true;
reg->precise = true;
@@ -1772,6 +1806,8 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
reg_mask, stack_mask);
}
+ if (!reg_mask && !stack_mask)
+ break;
if (!new_marks)
break;
@@ -1781,6 +1817,15 @@ static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
return 0;
}
+static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+{
+ return __mark_chain_precision(env, regno, -1);
+}
+
+static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
+{
+ return __mark_chain_precision(env, -1, spi);
+}
static bool is_spillable_regtype(enum bpf_reg_type type)
{
@@ -2215,6 +2260,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
env->seen_direct_write = true;
return true;
+
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ if (t == BPF_WRITE)
+ env->seen_direct_write = true;
+
+ return true;
+
default:
return false;
}
@@ -3407,12 +3459,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (func_id != BPF_FUNC_get_local_storage)
goto error;
break;
- /* devmap returns a pointer to a live net_device ifindex that we cannot
- * allow to be modified from bpf side. So do not allow lookup elements
- * for now.
- */
case BPF_MAP_TYPE_DEVMAP:
- if (func_id != BPF_FUNC_redirect_map)
+ if (func_id != BPF_FUNC_redirect_map &&
+ func_id != BPF_FUNC_map_lookup_elem)
goto error;
break;
/* Restrict bpf side of cpumap and xskmap, open when use-cases
@@ -6066,6 +6115,7 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
break;
default:
return 0;
@@ -7106,6 +7156,46 @@ static int propagate_liveness(struct bpf_verifier_env *env,
return 0;
}
+/* find precise scalars in the previous equivalent state and
+ * propagate them into the current state
+ */
+static int propagate_precision(struct bpf_verifier_env *env,
+ const struct bpf_verifier_state *old)
+{
+ struct bpf_reg_state *state_reg;
+ struct bpf_func_state *state;
+ int i, err = 0;
+
+ state = old->frame[old->curframe];
+ state_reg = state->regs;
+ for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "propagating r%d\n", i);
+ err = mark_chain_precision(env, i);
+ if (err < 0)
+ return err;
+ }
+
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
+ continue;
+ state_reg = &state->stack[i].spilled_ptr;
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "propagating fp%d\n",
+ (-i - 1) * BPF_REG_SIZE);
+ err = mark_chain_precision_stack(env, i);
+ if (err < 0)
+ return err;
+ }
+ return 0;
+}
+
static bool states_maybe_looping(struct bpf_verifier_state *old,
struct bpf_verifier_state *cur)
{
@@ -7198,6 +7288,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* this state and will pop a new one.
*/
err = propagate_liveness(env, &sl->state, cur);
+
+ /* if previous state reached the exit with precision and
+ * current state is equivalent to it (except precsion marks)
+ * the precision needs to be propagated back in
+ * the current state.
+ */
+ err = err ? : push_jmp_history(env, cur);
+ err = err ? : propagate_precision(env, &sl->state);
if (err)
return err;
return 1;
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index ef7338cebd18..9bb96ace9fa1 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -145,8 +145,7 @@ void __xsk_map_flush(struct bpf_map *map)
list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
xsk_flush(xs);
- __list_del(xs->flush_node.prev, xs->flush_node.next);
- xs->flush_node.prev = NULL;
+ __list_del_clearprev(&xs->flush_node);
}
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c102c240bb0b..ca1255d14576 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1431,6 +1431,20 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
return err;
}
+static int __init send_signal_irq_work_init(void)
+{
+ int cpu;
+ struct send_signal_irq_work *work;
+
+ for_each_possible_cpu(cpu) {
+ work = per_cpu_ptr(&send_signal_work, cpu);
+ init_irq_work(&work->irq_work, do_bpf_send_signal);
+ }
+ return 0;
+}
+
+subsys_initcall(send_signal_irq_work_init);
+
#ifdef CONFIG_MODULES
static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
void *module)
@@ -1478,18 +1492,5 @@ static int __init bpf_event_init(void)
return 0;
}
-static int __init send_signal_irq_work_init(void)
-{
- int cpu;
- struct send_signal_irq_work *work;
-
- for_each_possible_cpu(cpu) {
- work = per_cpu_ptr(&send_signal_work, cpu);
- init_irq_work(&work->irq_work, do_bpf_send_signal);
- }
- return 0;
-}
-
fs_initcall(bpf_event_init);
-subsys_initcall(send_signal_irq_work_init);
#endif /* CONFIG_MODULES */
diff --git a/net/core/filter.c b/net/core/filter.c
index 2014d76e0d2a..089aaea0ccc6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2158,8 +2158,8 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
if (unlikely(flags & ~(BPF_F_INGRESS)))
return TC_ACT_SHOT;
- ri->ifindex = ifindex;
ri->flags = flags;
+ ri->tgt_index = ifindex;
return TC_ACT_REDIRECT;
}
@@ -2169,8 +2169,8 @@ int skb_do_redirect(struct sk_buff *skb)
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct net_device *dev;
- dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
- ri->ifindex = 0;
+ dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
+ ri->tgt_index = 0;
if (unlikely(!dev)) {
kfree_skb(skb);
return -EINVAL;
@@ -3488,11 +3488,11 @@ xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
{
struct net_device *fwd;
- u32 index = ri->ifindex;
+ u32 index = ri->tgt_index;
int err;
fwd = dev_get_by_index_rcu(dev_net(dev), index);
- ri->ifindex = 0;
+ ri->tgt_index = 0;
if (unlikely(!fwd)) {
err = -EINVAL;
goto err;
@@ -3523,7 +3523,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
err = dev_map_enqueue(dst, xdp, dev_rx);
if (unlikely(err))
return err;
- __dev_map_insert_ctx(map, index);
break;
}
case BPF_MAP_TYPE_CPUMAP: {
@@ -3532,7 +3531,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
err = cpu_map_enqueue(rcpu, xdp, dev_rx);
if (unlikely(err))
return err;
- __cpu_map_insert_ctx(map, index);
break;
}
case BPF_MAP_TYPE_XSKMAP: {
@@ -3606,18 +3604,14 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog, struct bpf_map *map,
struct bpf_redirect_info *ri)
{
- u32 index = ri->ifindex;
- void *fwd = NULL;
+ u32 index = ri->tgt_index;
+ void *fwd = ri->tgt_value;
int err;
- ri->ifindex = 0;
+ ri->tgt_index = 0;
+ ri->tgt_value = NULL;
WRITE_ONCE(ri->map, NULL);
- fwd = __xdp_map_lookup_elem(map, index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
xdp_do_flush_map();
@@ -3653,19 +3647,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
struct bpf_map *map)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- u32 index = ri->ifindex;
- void *fwd = NULL;
+ u32 index = ri->tgt_index;
+ void *fwd = ri->tgt_value;
int err = 0;
- ri->ifindex = 0;
+ ri->tgt_index = 0;
+ ri->tgt_value = NULL;
WRITE_ONCE(ri->map, NULL);
- fwd = __xdp_map_lookup_elem(map, index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
-
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
struct bpf_dtab_netdev *dst = fwd;
@@ -3697,14 +3686,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_map *map = READ_ONCE(ri->map);
- u32 index = ri->ifindex;
+ u32 index = ri->tgt_index;
struct net_device *fwd;
int err = 0;
if (map)
return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
map);
- ri->ifindex = 0;
+ ri->tgt_index = 0;
fwd = dev_get_by_index_rcu(dev_net(dev), index);
if (unlikely(!fwd)) {
err = -EINVAL;
@@ -3732,8 +3721,9 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
if (unlikely(flags))
return XDP_ABORTED;
- ri->ifindex = ifindex;
ri->flags = flags;
+ ri->tgt_index = ifindex;
+ ri->tgt_value = NULL;
WRITE_ONCE(ri->map, NULL);
return XDP_REDIRECT;
@@ -3752,11 +3742,23 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- if (unlikely(flags))
+ /* Lower bits of the flags are used as return code on lookup failure */
+ if (unlikely(flags > XDP_TX))
return XDP_ABORTED;
- ri->ifindex = ifindex;
+ ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
+ if (unlikely(!ri->tgt_value)) {
+ /* If the lookup fails we want to clear out the state in the
+ * redirect_info struct completely, so that if an eBPF program
+ * performs multiple lookups, the last one always takes
+ * precedence.
+ */
+ WRITE_ONCE(ri->map, NULL);
+ return flags;
+ }
+
ri->flags = flags;
+ ri->tgt_index = ifindex;
WRITE_ONCE(ri->map, map);
return XDP_REDIRECT;
@@ -5192,54 +5194,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
};
#endif /* CONFIG_IPV6_SEG6_BPF */
-#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT) \
-do { \
- switch (si->off) { \
- case offsetof(md_type, snd_cwnd): \
- CONVERT(snd_cwnd); break; \
- case offsetof(md_type, srtt_us): \
- CONVERT(srtt_us); break; \
- case offsetof(md_type, snd_ssthresh): \
- CONVERT(snd_ssthresh); break; \
- case offsetof(md_type, rcv_nxt): \
- CONVERT(rcv_nxt); break; \
- case offsetof(md_type, snd_nxt): \
- CONVERT(snd_nxt); break; \
- case offsetof(md_type, snd_una): \
- CONVERT(snd_una); break; \
- case offsetof(md_type, mss_cache): \
- CONVERT(mss_cache); break; \
- case offsetof(md_type, ecn_flags): \
- CONVERT(ecn_flags); break; \
- case offsetof(md_type, rate_delivered): \
- CONVERT(rate_delivered); break; \
- case offsetof(md_type, rate_interval_us): \
- CONVERT(rate_interval_us); break; \
- case offsetof(md_type, packets_out): \
- CONVERT(packets_out); break; \
- case offsetof(md_type, retrans_out): \
- CONVERT(retrans_out); break; \
- case offsetof(md_type, total_retrans): \
- CONVERT(total_retrans); break; \
- case offsetof(md_type, segs_in): \
- CONVERT(segs_in); break; \
- case offsetof(md_type, data_segs_in): \
- CONVERT(data_segs_in); break; \
- case offsetof(md_type, segs_out): \
- CONVERT(segs_out); break; \
- case offsetof(md_type, data_segs_out): \
- CONVERT(data_segs_out); break; \
- case offsetof(md_type, lost_out): \
- CONVERT(lost_out); break; \
- case offsetof(md_type, sacked_out): \
- CONVERT(sacked_out); break; \
- case offsetof(md_type, bytes_received): \
- CONVERT(bytes_received); break; \
- case offsetof(md_type, bytes_acked): \
- CONVERT(bytes_acked); break; \
- } \
-} while (0)
-
#ifdef CONFIG_INET
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
int dif, int sdif, u8 family, u8 proto)
@@ -5590,7 +5544,8 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
- if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked))
+ if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
+ icsk_retransmits))
return false;
if (off % size != 0)
@@ -5621,8 +5576,19 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
offsetof(struct tcp_sock, FIELD)); \
} while (0)
- CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock,
- BPF_TCP_SOCK_GET_COMMON);
+#define BPF_INET_SOCK_GET_COMMON(FIELD) \
+ do { \
+ BUILD_BUG_ON(FIELD_SIZEOF(struct inet_connection_sock, \
+ FIELD) > \
+ FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct inet_connection_sock, \
+ FIELD), \
+ si->dst_reg, si->src_reg, \
+ offsetof( \
+ struct inet_connection_sock, \
+ FIELD)); \
+ } while (0)
if (insn > insn_buf)
return insn - insn_buf;
@@ -5638,6 +5604,81 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
offsetof(struct tcp_sock, rtt_min) +
offsetof(struct minmax_sample, v));
break;
+ case offsetof(struct bpf_tcp_sock, snd_cwnd):
+ BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
+ break;
+ case offsetof(struct bpf_tcp_sock, srtt_us):
+ BPF_TCP_SOCK_GET_COMMON(srtt_us);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_ssthresh):
+ BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
+ break;
+ case offsetof(struct bpf_tcp_sock, rcv_nxt):
+ BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_nxt):
+ BPF_TCP_SOCK_GET_COMMON(snd_nxt);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_una):
+ BPF_TCP_SOCK_GET_COMMON(snd_una);
+ break;
+ case offsetof(struct bpf_tcp_sock, mss_cache):
+ BPF_TCP_SOCK_GET_COMMON(mss_cache);
+ break;
+ case offsetof(struct bpf_tcp_sock, ecn_flags):
+ BPF_TCP_SOCK_GET_COMMON(ecn_flags);
+ break;
+ case offsetof(struct bpf_tcp_sock, rate_delivered):
+ BPF_TCP_SOCK_GET_COMMON(rate_delivered);
+ break;
+ case offsetof(struct bpf_tcp_sock, rate_interval_us):
+ BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
+ break;
+ case offsetof(struct bpf_tcp_sock, packets_out):
+ BPF_TCP_SOCK_GET_COMMON(packets_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, retrans_out):
+ BPF_TCP_SOCK_GET_COMMON(retrans_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, total_retrans):
+ BPF_TCP_SOCK_GET_COMMON(total_retrans);
+ break;
+ case offsetof(struct bpf_tcp_sock, segs_in):
+ BPF_TCP_SOCK_GET_COMMON(segs_in);
+ break;
+ case offsetof(struct bpf_tcp_sock, data_segs_in):
+ BPF_TCP_SOCK_GET_COMMON(data_segs_in);
+ break;
+ case offsetof(struct bpf_tcp_sock, segs_out):
+ BPF_TCP_SOCK_GET_COMMON(segs_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, data_segs_out):
+ BPF_TCP_SOCK_GET_COMMON(data_segs_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, lost_out):
+ BPF_TCP_SOCK_GET_COMMON(lost_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, sacked_out):
+ BPF_TCP_SOCK_GET_COMMON(sacked_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, bytes_received):
+ BPF_TCP_SOCK_GET_COMMON(bytes_received);
+ break;
+ case offsetof(struct bpf_tcp_sock, bytes_acked):
+ BPF_TCP_SOCK_GET_COMMON(bytes_acked);
+ break;
+ case offsetof(struct bpf_tcp_sock, dsack_dups):
+ BPF_TCP_SOCK_GET_COMMON(dsack_dups);
+ break;
+ case offsetof(struct bpf_tcp_sock, delivered):
+ BPF_TCP_SOCK_GET_COMMON(delivered);
+ break;
+ case offsetof(struct bpf_tcp_sock, delivered_ce):
+ BPF_TCP_SOCK_GET_COMMON(delivered_ce);
+ break;
+ case offsetof(struct bpf_tcp_sock, icsk_retransmits):
+ BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
+ break;
}
return insn - insn_buf;
@@ -5651,7 +5692,7 @@ BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
return (unsigned long)NULL;
}
-static const struct bpf_func_proto bpf_tcp_sock_proto = {
+const struct bpf_func_proto bpf_tcp_sock_proto = {
.func = bpf_tcp_sock,
.gpl_only = false,
.ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL,
@@ -7911,9 +7952,6 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
} while (0)
- CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops,
- SOCK_OPS_GET_TCP_SOCK_FIELD);
-
if (insn > insn_buf)
return insn - insn_buf;
@@ -8083,6 +8121,69 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
struct sock, type);
break;
+ case offsetof(struct bpf_sock_ops, snd_cwnd):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
+ break;
+ case offsetof(struct bpf_sock_ops, srtt_us):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_ssthresh):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
+ break;
+ case offsetof(struct bpf_sock_ops, rcv_nxt):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_nxt):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_una):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
+ break;
+ case offsetof(struct bpf_sock_ops, mss_cache):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
+ break;
+ case offsetof(struct bpf_sock_ops, ecn_flags):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
+ break;
+ case offsetof(struct bpf_sock_ops, rate_delivered):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
+ break;
+ case offsetof(struct bpf_sock_ops, rate_interval_us):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
+ break;
+ case offsetof(struct bpf_sock_ops, packets_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
+ break;
+ case offsetof(struct bpf_sock_ops, retrans_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
+ break;
+ case offsetof(struct bpf_sock_ops, total_retrans):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
+ break;
+ case offsetof(struct bpf_sock_ops, segs_in):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
+ break;
+ case offsetof(struct bpf_sock_ops, data_segs_in):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
+ break;
+ case offsetof(struct bpf_sock_ops, segs_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
+ break;
+ case offsetof(struct bpf_sock_ops, data_segs_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
+ break;
+ case offsetof(struct bpf_sock_ops, lost_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
+ break;
+ case offsetof(struct bpf_sock_ops, sacked_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
+ break;
+ case offsetof(struct bpf_sock_ops, bytes_received):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
+ break;
+ case offsetof(struct bpf_sock_ops, bytes_acked):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
+ break;
case offsetof(struct bpf_sock_ops, sk):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern,
diff --git a/net/core/xdp.c b/net/core/xdp.c
index b29d7b513a18..829377cc83db 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -85,7 +85,7 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
kfree(xa);
}
-bool __mem_id_disconnect(int id, bool force)
+static bool __mem_id_disconnect(int id, bool force)
{
struct xdp_mem_allocator *xa;
bool safe_to_remove = true;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b71efeb0ae5b..c21e8a22fb3b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -778,6 +778,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
tp->rtt_seq = tp->snd_nxt;
tp->mdev_max_us = tcp_rto_min_us(sk);
+
+ tcp_bpf_rtt(sk);
}
} else {
/* no previous measure. */
@@ -786,6 +788,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
tp->mdev_max_us = tp->rttvar_us;
tp->rtt_seq = tp->snd_nxt;
+
+ tcp_bpf_rtt(sk);
}
tp->srtt_us = max(1U, srtt);
}
diff --git a/net/socket.c b/net/socket.c
index a865708940f9..d97b74f762e8 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2050,6 +2050,8 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size,
static int __sys_setsockopt(int fd, int level, int optname,
char __user *optval, int optlen)
{
+ mm_segment_t oldfs = get_fs();
+ char *kernel_optval = NULL;
int err, fput_needed;
struct socket *sock;
@@ -2062,6 +2064,22 @@ static int __sys_setsockopt(int fd, int level, int optname,
if (err)
goto out_put;
+ err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level,
+ &optname, optval, &optlen,
+ &kernel_optval);
+
+ if (err < 0) {
+ goto out_put;
+ } else if (err > 0) {
+ err = 0;
+ goto out_put;
+ }
+
+ if (kernel_optval) {
+ set_fs(KERNEL_DS);
+ optval = (char __user __force *)kernel_optval;
+ }
+
if (level == SOL_SOCKET)
err =
sock_setsockopt(sock, level, optname, optval,
@@ -2070,6 +2088,11 @@ static int __sys_setsockopt(int fd, int level, int optname,
err =
sock->ops->setsockopt(sock, level, optname, optval,
optlen);
+
+ if (kernel_optval) {
+ set_fs(oldfs);
+ kfree(kernel_optval);
+ }
out_put:
fput_light(sock->file, fput_needed);
}
@@ -2092,6 +2115,7 @@ static int __sys_getsockopt(int fd, int level, int optname,
{
int err, fput_needed;
struct socket *sock;
+ int max_optlen;
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock != NULL) {
@@ -2099,6 +2123,8 @@ static int __sys_getsockopt(int fd, int level, int optname,
if (err)
goto out_put;
+ max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
+
if (level == SOL_SOCKET)
err =
sock_getsockopt(sock, level, optname, optval,
@@ -2107,6 +2133,10 @@ static int __sys_getsockopt(int fd, int level, int optname,
err =
sock->ops->getsockopt(sock, level, optname, optval,
optlen);
+
+ err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
+ optval, optlen,
+ max_optlen, err);
out_put:
fput_light(sock->file, fput_needed);
}
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index a14e8864e4fa..74417a851ed5 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -37,6 +37,12 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
READ_ONCE(xs->umem->fq);
}
+bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
+{
+ return xskq_has_addrs(umem->fq, cnt);
+}
+EXPORT_SYMBOL(xsk_umem_has_addrs);
+
u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
return xskq_peek_addr(umem->fq, addr);
@@ -166,22 +172,18 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);
-bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
+bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
- struct xdp_desc desc;
struct xdp_sock *xs;
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
- if (!xskq_peek_desc(xs->tx, &desc))
+ if (!xskq_peek_desc(xs->tx, desc))
continue;
- if (xskq_produce_addr_lazy(umem->cq, desc.addr))
+ if (xskq_produce_addr_lazy(umem->cq, desc->addr))
goto out;
- *dma = xdp_umem_get_dma(umem, desc.addr);
- *len = desc.len;
-
xskq_discard_desc(xs->tx);
rcu_read_unlock();
return true;
@@ -644,6 +646,26 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
return 0;
}
+ case XDP_OPTIONS:
+ {
+ struct xdp_options opts = {};
+
+ if (len < sizeof(opts))
+ return -EINVAL;
+
+ mutex_lock(&xs->mutex);
+ if (xs->zc)
+ opts.flags |= XDP_OPTIONS_ZEROCOPY;
+ mutex_unlock(&xs->mutex);
+
+ len = sizeof(opts);
+ if (copy_to_user(optval, &opts, len))
+ return -EFAULT;
+ if (put_user(len, optlen))
+ return -EFAULT;
+
+ return 0;
+ }
default:
break;
}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 88b9ae24658d..12b49784a6d5 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -117,6 +117,20 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
return q->nentries - (producer - q->cons_tail);
}
+static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt)
+{
+ u32 entries = q->prod_tail - q->cons_tail;
+
+ if (entries >= cnt)
+ return true;
+
+ /* Refresh the local pointer. */
+ q->prod_tail = READ_ONCE(q->ring->producer);
+ entries = q->prod_tail - q->cons_tail;
+
+ return entries >= cnt;
+}
+
/* UMEM queue */
static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 0917f8cf4fab..f90daadfbc89 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -154,6 +154,7 @@ always += tcp_iw_kern.o
always += tcp_clamp_kern.o
always += tcp_basertt_kern.o
always += tcp_tos_reflect_kern.o
+always += tcp_dumpstats_kern.o
always += xdp_redirect_kern.o
always += xdp_redirect_map_kern.o
always += xdp_redirect_cpu_kern.o
@@ -168,6 +169,7 @@ always += task_fd_query_kern.o
always += xdp_sample_pkts_kern.o
always += ibumad_kern.o
always += hbm_out_kern.o
+always += hbm_edt_kern.o
KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/bpf/
@@ -272,6 +274,7 @@ $(src)/*.c: verify_target_bpf $(LIBBPF)
$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
$(obj)/hbm.o: $(src)/hbm.h
+$(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
# But, there is no easy way to fix it, so just exclude it since it is
diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh
index e48b047d4646..ffe4c0607341 100755
--- a/samples/bpf/do_hbm_test.sh
+++ b/samples/bpf/do_hbm_test.sh
@@ -14,7 +14,7 @@ Usage() {
echo "loads. The output is the goodput in Mbps (unless -D was used)."
echo ""
echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]"
- echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E]"
+ echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E] [--edt]"
echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]"
echo " [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]"
echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server]"
@@ -30,6 +30,7 @@ Usage() {
echo " other detailed information. This information is"
echo " test dependent (i.e. iperf3 or netperf)."
echo " -E enable ECN (not required for dctcp)"
+ echo " --edt use fq's Earliest Departure Time (requires fq)"
echo " -f or --flows number of concurrent flows (default=1)"
echo " -i or --id cgroup id (an integer, default is 1)"
echo " -N use netperf instead of iperf3"
@@ -130,13 +131,12 @@ processArgs () {
details=1
;;
-E)
- ecn=1
+ ecn=1
+ ;;
+ --edt)
+ flags="$flags --edt"
+ qdisc="fq"
;;
- # Support for upcomming fq Early Departure Time egress rate limiting
- #--edt)
- # prog="hbm_out_edt_kern.o"
- # qdisc="fq"
- # ;;
-f=*|--flows=*)
flows="${i#*=}"
;;
@@ -228,8 +228,8 @@ if [ "$netem" -ne "0" ] ; then
tc qdisc del dev lo root > /dev/null 2>&1
tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1
elif [ "$qdisc" != "" ] ; then
- tc qdisc del dev lo root > /dev/null 2>&1
- tc qdisc add dev lo root $qdisc > /dev/null 2>&1
+ tc qdisc del dev eth0 root > /dev/null 2>&1
+ tc qdisc add dev eth0 root $qdisc > /dev/null 2>&1
fi
n=0
@@ -399,7 +399,9 @@ fi
if [ "$netem" -ne "0" ] ; then
tc qdisc del dev lo root > /dev/null 2>&1
fi
-
+if [ "$qdisc" != "" ] ; then
+ tc qdisc del dev eth0 root > /dev/null 2>&1
+fi
sleep 2
hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'`
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
index b905b32ff185..e0fbab9bec83 100644
--- a/samples/bpf/hbm.c
+++ b/samples/bpf/hbm.c
@@ -62,6 +62,7 @@ bool loopback_flag;
bool debugFlag;
bool work_conserving_flag;
bool no_cn_flag;
+bool edt_flag;
static void Usage(void);
static void read_trace_pipe2(void);
@@ -372,9 +373,14 @@ static int run_bpf_prog(char *prog, int cg_id)
fprintf(fout, "avg rtt:%d\n",
(int)(qstats.sum_rtt / (qstats.pkts_total + 1)));
// Average credit
- fprintf(fout, "avg credit:%d\n",
- (int)(qstats.sum_credit /
- (1500 * ((int)qstats.pkts_total) + 1)));
+ if (edt_flag)
+ fprintf(fout, "avg credit_ms:%.03f\n",
+ (qstats.sum_credit /
+ (qstats.pkts_total + 1.0)) / 1000000.0);
+ else
+ fprintf(fout, "avg credit:%d\n",
+ (int)(qstats.sum_credit /
+ (1500 * ((int)qstats.pkts_total ) + 1)));
// Return values stats
for (k = 0; k < RET_VAL_COUNT; k++) {
@@ -408,6 +414,7 @@ static void Usage(void)
" Where:\n"
" -o indicates egress direction (default)\n"
" -d print BPF trace debug buffer\n"
+ " --edt use fq's Earliest Departure Time\n"
" -l also limit flows using loopback\n"
" -n <#> to create cgroup \"/hbm#\" and attach prog\n"
" Default is /hbm1\n"
@@ -433,6 +440,7 @@ int main(int argc, char **argv)
char *optstring = "iodln:r:st:wh";
struct option loptions[] = {
{"no_cn", 0, NULL, 1},
+ {"edt", 0, NULL, 2},
{NULL, 0, NULL, 0}
};
@@ -441,6 +449,10 @@ int main(int argc, char **argv)
case 1:
no_cn_flag = true;
break;
+ case 2:
+ prog = "hbm_edt_kern.o";
+ edt_flag = true;
+ break;
case'o':
break;
case 'd':
diff --git a/samples/bpf/hbm_edt_kern.c b/samples/bpf/hbm_edt_kern.c
new file mode 100644
index 000000000000..a65b677acdb0
--- /dev/null
+++ b/samples/bpf/hbm_edt_kern.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample Host Bandwidth Manager (HBM) BPF program.
+ *
+ * A cgroup skb BPF egress program to limit cgroup output bandwidth.
+ * It uses a modified virtual token bucket queue to limit average
+ * egress bandwidth. The implementation uses credits instead of tokens.
+ * Negative credits imply that queueing would have happened (this is
+ * a virtual queue, so no queueing is done by it. However, queueing may
+ * occur at the actual qdisc (which is not used for rate limiting).
+ *
+ * This implementation uses 3 thresholds, one to start marking packets and
+ * the other two to drop packets:
+ * CREDIT
+ * - <--------------------------|------------------------> +
+ * | | | 0
+ * | Large pkt |
+ * | drop thresh |
+ * Small pkt drop Mark threshold
+ * thresh
+ *
+ * The effect of marking depends on the type of packet:
+ * a) If the packet is ECN enabled and it is a TCP packet, then the packet
+ * is ECN marked.
+ * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
+ * to reduce the congestion window. The current implementation uses a linear
+ * distribution (0% probability at marking threshold, 100% probability
+ * at drop threshold).
+ * c) If the packet is not a TCP packet, then it is dropped.
+ *
+ * If the credit is below the drop threshold, the packet is dropped. If it
+ * is a TCP packet, then it also calls tcp_cwr since packets dropped by
+ * by a cgroup skb BPF program do not automatically trigger a call to
+ * tcp_cwr in the current kernel code.
+ *
+ * This BPF program actually uses 2 drop thresholds, one threshold
+ * for larger packets (>= 120 bytes) and another for smaller packets. This
+ * protects smaller packets such as SYNs, ACKs, etc.
+ *
+ * The default bandwidth limit is set at 1Gbps but this can be changed by
+ * a user program through a shared BPF map. In addition, by default this BPF
+ * program does not limit connections using loopback. This behavior can be
+ * overwritten by the user program. There is also an option to calculate
+ * some statistics, such as percent of packets marked or dropped, which
+ * a user program, such as hbm, can access.
+ */
+
+#include "hbm_kern.h"
+
+SEC("cgroup_skb/egress")
+int _hbm_out_cg(struct __sk_buff *skb)
+{
+ long long delta = 0, delta_send;
+ unsigned long long curtime, sendtime;
+ struct hbm_queue_stats *qsp = NULL;
+ unsigned int queue_index = 0;
+ bool congestion_flag = false;
+ bool ecn_ce_flag = false;
+ struct hbm_pkt_info pkti = {};
+ struct hbm_vqueue *qdp;
+ bool drop_flag = false;
+ bool cwr_flag = false;
+ int len = skb->len;
+ int rv = ALLOW_PKT;
+
+ qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
+
+ // Check if we should ignore loopback traffic
+ if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
+ return ALLOW_PKT;
+
+ hbm_get_pkt_info(skb, &pkti);
+
+ // We may want to account for the length of headers in len
+ // calculation, like ETH header + overhead, specially if it
+ // is a gso packet. But I am not doing it right now.
+
+ qdp = bpf_get_local_storage(&queue_state, 0);
+ if (!qdp)
+ return ALLOW_PKT;
+ if (qdp->lasttime == 0)
+ hbm_init_edt_vqueue(qdp, 1024);
+
+ curtime = bpf_ktime_get_ns();
+
+ // Begin critical section
+ bpf_spin_lock(&qdp->lock);
+ delta = qdp->lasttime - curtime;
+ // bound bursts to 100us
+ if (delta < -BURST_SIZE_NS) {
+ // negative delta is a credit that allows bursts
+ qdp->lasttime = curtime - BURST_SIZE_NS;
+ delta = -BURST_SIZE_NS;
+ }
+ sendtime = qdp->lasttime;
+ delta_send = BYTES_TO_NS(len, qdp->rate);
+ __sync_add_and_fetch(&(qdp->lasttime), delta_send);
+ bpf_spin_unlock(&qdp->lock);
+ // End critical section
+
+ // Set EDT of packet
+ skb->tstamp = sendtime;
+
+ // Check if we should update rate
+ if (qsp != NULL && (qsp->rate * 128) != qdp->rate)
+ qdp->rate = qsp->rate * 128;
+
+ // Set flags (drop, congestion, cwr)
+ // last packet will be sent in the future, bound latency
+ if (delta > DROP_THRESH_NS || (delta > LARGE_PKT_DROP_THRESH_NS &&
+ len > LARGE_PKT_THRESH)) {
+ drop_flag = true;
+ if (pkti.is_tcp && pkti.ecn == 0)
+ cwr_flag = true;
+ } else if (delta > MARK_THRESH_NS) {
+ if (pkti.is_tcp)
+ congestion_flag = true;
+ else
+ drop_flag = true;
+ }
+
+ if (congestion_flag) {
+ if (bpf_skb_ecn_set_ce(skb)) {
+ ecn_ce_flag = true;
+ } else {
+ if (pkti.is_tcp) {
+ unsigned int rand = bpf_get_prandom_u32();
+
+ if (delta >= MARK_THRESH_NS +
+ (rand % MARK_REGION_SIZE_NS)) {
+ // Do congestion control
+ cwr_flag = true;
+ }
+ } else if (len > LARGE_PKT_THRESH) {
+ // Problem if too many small packets?
+ drop_flag = true;
+ congestion_flag = false;
+ }
+ }
+ }
+
+ if (pkti.is_tcp && drop_flag && pkti.packets_out <= 1) {
+ drop_flag = false;
+ cwr_flag = true;
+ congestion_flag = false;
+ }
+
+ if (qsp != NULL && qsp->no_cn)
+ cwr_flag = false;
+
+ hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,
+ cwr_flag, ecn_ce_flag, &pkti, (int) delta);
+
+ if (drop_flag) {
+ __sync_add_and_fetch(&(qdp->lasttime), -delta_send);
+ rv = DROP_PKT;
+ }
+
+ if (cwr_flag)
+ rv |= CWR;
+ return rv;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h
index be19cf1d5cd5..aa207a2eebbd 100644
--- a/samples/bpf/hbm_kern.h
+++ b/samples/bpf/hbm_kern.h
@@ -29,6 +29,7 @@
#define DROP_PKT 0
#define ALLOW_PKT 1
#define TCP_ECN_OK 1
+#define CWR 2
#ifndef HBM_DEBUG // Define HBM_DEBUG to enable debugging
#undef bpf_printk
@@ -45,8 +46,18 @@
#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET)
#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET)
+// Time base accounting for fq's EDT
+#define BURST_SIZE_NS 100000 // 100us
+#define MARK_THRESH_NS 50000 // 50us
+#define DROP_THRESH_NS 500000 // 500us
+// Reserve 20us of queuing for small packets (less than 120 bytes)
+#define LARGE_PKT_DROP_THRESH_NS (DROP_THRESH_NS - 20000)
+#define MARK_REGION_SIZE_NS (LARGE_PKT_DROP_THRESH_NS - MARK_THRESH_NS)
+
// rate in bytes per ns << 20
#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
+#define BYTES_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
+#define BYTES_TO_NS(bytes, rate) div64_u64(((u64)(bytes)) << 20, (u64)(rate))
struct bpf_map_def SEC("maps") queue_state = {
.type = BPF_MAP_TYPE_CGROUP_STORAGE,
@@ -67,6 +78,7 @@ BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats);
struct hbm_pkt_info {
int cwnd;
int rtt;
+ int packets_out;
bool is_ip;
bool is_tcp;
short ecn;
@@ -86,16 +98,20 @@ static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti)
if (tp) {
pkti->cwnd = tp->snd_cwnd;
pkti->rtt = tp->srtt_us >> 3;
+ pkti->packets_out = tp->packets_out;
return 0;
}
}
}
}
+ pkti->cwnd = 0;
+ pkti->rtt = 0;
+ pkti->packets_out = 0;
return 1;
}
-static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
- struct hbm_pkt_info *pkti)
+static void hbm_get_pkt_info(struct __sk_buff *skb,
+ struct hbm_pkt_info *pkti)
{
struct iphdr iph;
struct ipv6hdr *ip6h;
@@ -123,10 +139,22 @@ static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate)
{
- bpf_printk("Initializing queue_state, rate:%d\n", rate * 128);
- qdp->lasttime = bpf_ktime_get_ns();
- qdp->credit = INIT_CREDIT;
- qdp->rate = rate * 128;
+ bpf_printk("Initializing queue_state, rate:%d\n", rate * 128);
+ qdp->lasttime = bpf_ktime_get_ns();
+ qdp->credit = INIT_CREDIT;
+ qdp->rate = rate * 128;
+}
+
+static __always_inline void hbm_init_edt_vqueue(struct hbm_vqueue *qdp,
+ int rate)
+{
+ unsigned long long curtime;
+
+ curtime = bpf_ktime_get_ns();
+ bpf_printk("Initializing queue_state, rate:%d\n", rate * 128);
+ qdp->lasttime = curtime - BURST_SIZE_NS; // support initial burst
+ qdp->credit = 0; // not used
+ qdp->rate = rate * 128;
}
static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c
index 38b2b3f22049..f281df7e0089 100644
--- a/samples/bpf/ibumad_kern.c
+++ b/samples/bpf/ibumad_kern.c
@@ -31,15 +31,9 @@ struct bpf_map_def SEC("maps") write_count = {
};
#undef DEBUG
-#ifdef DEBUG
-#define bpf_debug(fmt, ...) \
-({ \
- char ____fmt[] = fmt; \
- bpf_trace_printk(____fmt, sizeof(____fmt), \
- ##__VA_ARGS__); \
-})
-#else
-#define bpf_debug(fmt, ...)
+#ifndef DEBUG
+#undef bpf_printk
+#define bpf_printk(fmt, ...)
#endif
/* Taken from the current format defined in
@@ -86,7 +80,7 @@ int on_ib_umad_read_recv(struct ib_umad_rw_args *ctx)
u64 zero = 0, *val;
u8 class = ctx->mgmt_class;
- bpf_debug("ib_umad read recv : class 0x%x\n", class);
+ bpf_printk("ib_umad read recv : class 0x%x\n", class);
val = bpf_map_lookup_elem(&read_count, &class);
if (!val) {
@@ -106,7 +100,7 @@ int on_ib_umad_read_send(struct ib_umad_rw_args *ctx)
u64 zero = 0, *val;
u8 class = ctx->mgmt_class;
- bpf_debug("ib_umad read send : class 0x%x\n", class);
+ bpf_printk("ib_umad read send : class 0x%x\n", class);
val = bpf_map_lookup_elem(&read_count, &class);
if (!val) {
@@ -126,7 +120,7 @@ int on_ib_umad_write(struct ib_umad_rw_args *ctx)
u64 zero = 0, *val;
u8 class = ctx->mgmt_class;
- bpf_debug("ib_umad write : class 0x%x\n", class);
+ bpf_printk("ib_umad write : class 0x%x\n", class);
val = bpf_map_lookup_elem(&write_count, &class);
if (!val) {
diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme
index fee746621aec..78e247f62108 100644
--- a/samples/bpf/tcp_bpf.readme
+++ b/samples/bpf/tcp_bpf.readme
@@ -25,4 +25,4 @@ attached to the cgroupv2).
To remove (unattach) a socket_ops BPF program from a cgroupv2:
- bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
+ bpftool cgroup detach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
diff --git a/samples/bpf/tcp_dumpstats_kern.c b/samples/bpf/tcp_dumpstats_kern.c
new file mode 100644
index 000000000000..8557913106a0
--- /dev/null
+++ b/samples/bpf/tcp_dumpstats_kern.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Refer to samples/bpf/tcp_bpf.readme for the instructions on
+ * how to run this sample program.
+ */
+#include <linux/bpf.h>
+
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define INTERVAL 1000000000ULL
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+struct {
+ __u32 type;
+ __u32 map_flags;
+ int *key;
+ __u64 *value;
+} bpf_next_dump SEC(".maps") = {
+ .type = BPF_MAP_TYPE_SK_STORAGE,
+ .map_flags = BPF_F_NO_PREALLOC,
+};
+
+SEC("sockops")
+int _sockops(struct bpf_sock_ops *ctx)
+{
+ struct bpf_tcp_sock *tcp_sk;
+ struct bpf_sock *sk;
+ __u64 *next_dump;
+ __u64 now;
+
+ switch (ctx->op) {
+ case BPF_SOCK_OPS_TCP_CONNECT_CB:
+ bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG);
+ return 1;
+ case BPF_SOCK_OPS_RTT_CB:
+ break;
+ default:
+ return 1;
+ }
+
+ sk = ctx->sk;
+ if (!sk)
+ return 1;
+
+ next_dump = bpf_sk_storage_get(&bpf_next_dump, sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!next_dump)
+ return 1;
+
+ now = bpf_ktime_get_ns();
+ if (now < *next_dump)
+ return 1;
+
+ tcp_sk = bpf_tcp_sock(sk);
+ if (!tcp_sk)
+ return 1;
+
+ *next_dump = now + INTERVAL;
+
+ bpf_printk("dsack_dups=%u delivered=%u\n",
+ tcp_sk->dsack_dups, tcp_sk->delivered);
+ bpf_printk("delivered_ce=%u icsk_retransmits=%u\n",
+ tcp_sk->delivered_ce, tcp_sk->icsk_retransmits);
+
+ return 1;
+}
diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c
index 586ff751aba9..a3596b617c4c 100644
--- a/samples/bpf/xdp_adjust_tail_user.c
+++ b/samples/bpf/xdp_adjust_tail_user.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <net/if.h>
#include <sys/resource.h>
#include <arpa/inet.h>
#include <netinet/ether.h>
@@ -69,7 +70,7 @@ static void usage(const char *cmd)
printf("Start a XDP prog which send ICMP \"packet too big\" \n"
"messages if ingress packet is bigger then MAX_SIZE bytes\n");
printf("Usage: %s [...]\n", cmd);
- printf(" -i <ifindex> Interface Index\n");
+ printf(" -i <ifname|ifindex> Interface\n");
printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n");
printf(" -S use skb-mode\n");
printf(" -N enforce native mode\n");
@@ -102,7 +103,9 @@ int main(int argc, char **argv)
switch (opt) {
case 'i':
- ifindex = atoi(optarg);
+ ifindex = if_nametoindex(optarg);
+ if (!ifindex)
+ ifindex = atoi(optarg);
break;
case 'T':
kill_after_s = atoi(optarg);
@@ -136,6 +139,11 @@ int main(int argc, char **argv)
return 1;
}
+ if (!ifindex) {
+ fprintf(stderr, "Invalid ifname\n");
+ return 1;
+ }
+
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
prog_load_attr.file = filename;
diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c
index 15bb6f67f9c3..f70ee33907fd 100644
--- a/samples/bpf/xdp_redirect_map_user.c
+++ b/samples/bpf/xdp_redirect_map_user.c
@@ -10,6 +10,7 @@
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
+#include <net/if.h>
#include <unistd.h>
#include <libgen.h>
#include <sys/resource.h>
@@ -85,7 +86,7 @@ static void poll_stats(int interval, int ifindex)
static void usage(const char *prog)
{
fprintf(stderr,
- "usage: %s [OPTS] IFINDEX_IN IFINDEX_OUT\n\n"
+ "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n"
"OPTS:\n"
" -S use skb-mode\n"
" -N enforce native mode\n"
@@ -127,7 +128,7 @@ int main(int argc, char **argv)
}
if (optind == argc) {
- printf("usage: %s IFINDEX_IN IFINDEX_OUT\n", argv[0]);
+ printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]);
return 1;
}
@@ -136,8 +137,14 @@ int main(int argc, char **argv)
return 1;
}
- ifindex_in = strtoul(argv[optind], NULL, 0);
- ifindex_out = strtoul(argv[optind + 1], NULL, 0);
+ ifindex_in = if_nametoindex(argv[optind]);
+ if (!ifindex_in)
+ ifindex_in = strtoul(argv[optind], NULL, 0);
+
+ ifindex_out = if_nametoindex(argv[optind + 1]);
+ if (!ifindex_out)
+ ifindex_out = strtoul(argv[optind + 1], NULL, 0);
+
printf("input: %d output: %d\n", ifindex_in, ifindex_out);
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c
index ce71be187205..39de06f3ec25 100644
--- a/samples/bpf/xdp_redirect_user.c
+++ b/samples/bpf/xdp_redirect_user.c
@@ -10,6 +10,7 @@
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
+#include <net/if.h>
#include <unistd.h>
#include <libgen.h>
#include <sys/resource.h>
@@ -85,7 +86,7 @@ static void poll_stats(int interval, int ifindex)
static void usage(const char *prog)
{
fprintf(stderr,
- "usage: %s [OPTS] IFINDEX_IN IFINDEX_OUT\n\n"
+ "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n"
"OPTS:\n"
" -S use skb-mode\n"
" -N enforce native mode\n"
@@ -128,7 +129,7 @@ int main(int argc, char **argv)
}
if (optind == argc) {
- printf("usage: %s IFINDEX_IN IFINDEX_OUT\n", argv[0]);
+ printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]);
return 1;
}
@@ -137,8 +138,14 @@ int main(int argc, char **argv)
return 1;
}
- ifindex_in = strtoul(argv[optind], NULL, 0);
- ifindex_out = strtoul(argv[optind + 1], NULL, 0);
+ ifindex_in = if_nametoindex(argv[optind]);
+ if (!ifindex_in)
+ ifindex_in = strtoul(argv[optind], NULL, 0);
+
+ ifindex_out = if_nametoindex(argv[optind + 1]);
+ if (!ifindex_out)
+ ifindex_out = strtoul(argv[optind + 1], NULL, 0);
+
printf("input: %d output: %d\n", ifindex_in, ifindex_out);
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c
index 394896430712..dfb68582e243 100644
--- a/samples/bpf/xdp_tx_iptunnel_user.c
+++ b/samples/bpf/xdp_tx_iptunnel_user.c
@@ -9,6 +9,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <net/if.h>
#include <sys/resource.h>
#include <arpa/inet.h>
#include <netinet/ether.h>
@@ -83,7 +84,7 @@ static void usage(const char *cmd)
"in an IPv4/v6 header and XDP_TX it out. The dst <VIP:PORT>\n"
"is used to select packets to encapsulate\n\n");
printf("Usage: %s [...]\n", cmd);
- printf(" -i <ifindex> Interface Index\n");
+ printf(" -i <ifname|ifindex> Interface\n");
printf(" -a <vip-service-address> IPv4 or IPv6\n");
printf(" -p <vip-service-port> A port range (e.g. 433-444) is also allowed\n");
printf(" -s <source-ip> Used in the IPTunnel header\n");
@@ -181,7 +182,9 @@ int main(int argc, char **argv)
switch (opt) {
case 'i':
- ifindex = atoi(optarg);
+ ifindex = if_nametoindex(optarg);
+ if (!ifindex)
+ ifindex = atoi(optarg);
break;
case 'a':
vip.family = parse_ipstr(optarg, vip.daddr.v6);
@@ -253,6 +256,11 @@ int main(int argc, char **argv)
return 1;
}
+ if (!ifindex) {
+ fprintf(stderr, "Invalid ifname\n");
+ return 1;
+ }
+
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
prog_load_attr.file = filename;
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 0f5eb0d7f2df..93eaaf7239b2 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -68,6 +68,7 @@ static int opt_queue;
static int opt_poll;
static int opt_interval = 1;
static u32 opt_xdp_bind_flags;
+static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
static __u32 prog_id;
struct xsk_umem_info {
@@ -276,6 +277,12 @@ static size_t gen_eth_frame(struct xsk_umem_info *umem, u64 addr)
static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
{
struct xsk_umem_info *umem;
+ struct xsk_umem_config cfg = {
+ .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
+ .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
+ .frame_size = opt_xsk_frame_size,
+ .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
+ };
int ret;
umem = calloc(1, sizeof(*umem));
@@ -283,7 +290,7 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
exit_with_error(errno);
ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
- NULL);
+ &cfg);
if (ret)
exit_with_error(-ret);
@@ -323,11 +330,9 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem)
&idx);
if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS)
exit_with_error(-ret);
- for (i = 0;
- i < XSK_RING_PROD__DEFAULT_NUM_DESCS *
- XSK_UMEM__DEFAULT_FRAME_SIZE;
- i += XSK_UMEM__DEFAULT_FRAME_SIZE)
- *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = i;
+ for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++)
+ *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) =
+ i * opt_xsk_frame_size;
xsk_ring_prod__submit(&xsk->umem->fq,
XSK_RING_PROD__DEFAULT_NUM_DESCS);
@@ -346,6 +351,7 @@ static struct option long_options[] = {
{"interval", required_argument, 0, 'n'},
{"zero-copy", no_argument, 0, 'z'},
{"copy", no_argument, 0, 'c'},
+ {"frame-size", required_argument, 0, 'f'},
{0, 0, 0, 0}
};
@@ -365,8 +371,9 @@ static void usage(const char *prog)
" -n, --interval=n Specify statistics update interval (default 1 sec).\n"
" -z, --zero-copy Force zero-copy mode.\n"
" -c, --copy Force copy mode.\n"
+ " -f, --frame-size=n Set the frame size (must be a power of two, default is %d).\n"
"\n";
- fprintf(stderr, str, prog);
+ fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE);
exit(EXIT_FAILURE);
}
@@ -377,7 +384,7 @@ static void parse_command_line(int argc, char **argv)
opterr = 0;
for (;;) {
- c = getopt_long(argc, argv, "Frtli:q:psSNn:cz", long_options,
+ c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:", long_options,
&option_index);
if (c == -1)
break;
@@ -420,6 +427,9 @@ static void parse_command_line(int argc, char **argv)
case 'F':
opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
break;
+ case 'f':
+ opt_xsk_frame_size = atoi(optarg);
+ break;
default:
usage(basename(argv[0]));
}
@@ -432,6 +442,11 @@ static void parse_command_line(int argc, char **argv)
usage(basename(argv[0]));
}
+ if (opt_xsk_frame_size & (opt_xsk_frame_size - 1)) {
+ fprintf(stderr, "--frame-size=%d is not a power of two\n",
+ opt_xsk_frame_size);
+ usage(basename(argv[0]));
+ }
}
static void kick_tx(struct xsk_socket_info *xsk)
@@ -583,8 +598,7 @@ static void tx_only(struct xsk_socket_info *xsk)
for (i = 0; i < BATCH_SIZE; i++) {
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr
- = (frame_nb + i) <<
- XSK_UMEM__DEFAULT_FRAME_SHIFT;
+ = (frame_nb + i) * opt_xsk_frame_size;
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
sizeof(pkt_data) - 1;
}
@@ -661,21 +675,19 @@ int main(int argc, char **argv)
}
ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
- NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE);
+ NUM_FRAMES * opt_xsk_frame_size);
if (ret)
exit_with_error(ret);
/* Create sockets... */
- umem = xsk_configure_umem(bufs,
- NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE);
+ umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size);
xsks[num_socks++] = xsk_configure_socket(umem);
if (opt_bench == BENCH_TXONLY) {
int i;
- for (i = 0; i < NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
- i += XSK_UMEM__DEFAULT_FRAME_SIZE)
- (void)gen_eth_frame(umem, i);
+ for (i = 0; i < NUM_FRAMES; i++)
+ (void)gen_eth_frame(umem, i * opt_xsk_frame_size);
}
signal(SIGINT, int_exit);
diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index d80fdde79c22..585f270c2d25 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -29,7 +29,8 @@ CGROUP COMMANDS
| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
| *ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** |
| **bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** |
-| **sendmsg4** | **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** }
+| **sendmsg4** | **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** |
+| **getsockopt** | **setsockopt** }
| *ATTACH_FLAGS* := { **multi** | **override** }
DESCRIPTION
@@ -90,7 +91,9 @@ DESCRIPTION
an unconnected udp4 socket (since 5.2);
**recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for
an unconnected udp6 socket (since 5.2);
- **sysctl** sysctl access (since 5.2).
+ **sysctl** sysctl access (since 5.2);
+ **getsockopt** call to getsockopt (since 5.3);
+ **setsockopt** call to setsockopt (since 5.3).
**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
Detach *PROG* from the cgroup *CGROUP* and attach type
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 55dd06517a3b..1df637f85f94 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -40,7 +40,8 @@ PROG COMMANDS
| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** |
| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
| **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** |
-| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl**
+| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** |
+| **cgroup/getsockopt** | **cgroup/setsockopt**
| }
| *ATTACH_TYPE* := {
| **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector**
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index a17e84c67498..ba37095e1f62 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -379,7 +379,8 @@ _bpftool()
cgroup/sendmsg4 cgroup/sendmsg6 \
cgroup/recvmsg4 cgroup/recvmsg6 \
cgroup/post_bind4 cgroup/post_bind6 \
- cgroup/sysctl" -- \
+ cgroup/sysctl cgroup/getsockopt \
+ cgroup/setsockopt" -- \
"$cur" ) )
return 0
;;
@@ -689,7 +690,8 @@ _bpftool()
attach|detach)
local ATTACH_TYPES='ingress egress sock_create sock_ops \
device bind4 bind6 post_bind4 post_bind6 connect4 \
- connect6 sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl'
+ connect6 sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl \
+ getsockopt setsockopt'
local ATTACH_FLAGS='multi override'
local PROG_TYPE='id pinned tag'
case $prev in
@@ -699,7 +701,8 @@ _bpftool()
;;
ingress|egress|sock_create|sock_ops|device|bind4|bind6|\
post_bind4|post_bind6|connect4|connect6|sendmsg4|\
- sendmsg6|recvmsg4|recvmsg6|sysctl)
+ sendmsg6|recvmsg4|recvmsg6|sysctl|getsockopt|\
+ setsockopt)
COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
"$cur" ) )
return 0
diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index 73ec8ea33fb4..390b89a224f1 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -26,7 +26,8 @@
" sock_ops | device | bind4 | bind6 |\n" \
" post_bind4 | post_bind6 | connect4 |\n" \
" connect6 | sendmsg4 | sendmsg6 |\n" \
- " recvmsg4 | recvmsg6 | sysctl }"
+ " recvmsg4 | recvmsg6 | sysctl |\n" \
+ " getsockopt | setsockopt }"
static const char * const attach_type_strings[] = {
[BPF_CGROUP_INET_INGRESS] = "ingress",
@@ -45,6 +46,8 @@ static const char * const attach_type_strings[] = {
[BPF_CGROUP_SYSCTL] = "sysctl",
[BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4",
[BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6",
+ [BPF_CGROUP_GETSOCKOPT] = "getsockopt",
+ [BPF_CGROUP_SETSOCKOPT] = "setsockopt",
[__MAX_BPF_ATTACH_TYPE] = NULL,
};
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 28a2a5857e14..9c5d9c80f71e 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -74,6 +74,7 @@ static const char * const prog_type_name[] = {
[BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport",
[BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector",
[BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl",
+ [BPF_PROG_TYPE_CGROUP_SOCKOPT] = "cgroup_sockopt",
};
extern const char * const map_type_name[];
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index f1a831f05010..9b0db5d14e31 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -1071,7 +1071,8 @@ static int do_help(int argc, char **argv)
" cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n"
" cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n"
" cgroup/sendmsg4 | cgroup/sendmsg6 | cgroup/recvmsg4 |\n"
- " cgroup/recvmsg6 }\n"
+ " cgroup/recvmsg6 | cgroup/getsockopt |\n"
+ " cgroup/setsockopt }\n"
" ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n"
" flow_dissector }\n"
" " HELP_SPEC_OPTIONS "\n"
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index b077507efa3f..cecf42c871d4 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -170,6 +170,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_FLOW_DISSECTOR,
BPF_PROG_TYPE_CGROUP_SYSCTL,
BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+ BPF_PROG_TYPE_CGROUP_SOCKOPT,
};
enum bpf_attach_type {
@@ -194,6 +195,8 @@ enum bpf_attach_type {
BPF_CGROUP_SYSCTL,
BPF_CGROUP_UDP4_RECVMSG,
BPF_CGROUP_UDP6_RECVMSG,
+ BPF_CGROUP_GETSOCKOPT,
+ BPF_CGROUP_SETSOCKOPT,
__MAX_BPF_ATTACH_TYPE
};
@@ -1764,6 +1767,7 @@ union bpf_attr {
* * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
* * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
* * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT)
*
* Therefore, this function can be used to clear a callback flag by
* setting the appropriate bit to zero. e.g. to disable the RTO
@@ -3066,6 +3070,12 @@ struct bpf_tcp_sock {
* sum(delta(snd_una)), or how many bytes
* were acked.
*/
+ __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups
+ * total number of DSACK blocks received
+ */
+ __u32 delivered; /* Total data packets delivered incl. rexmits */
+ __u32 delivered_ce; /* Like the above but only ECE marked packets */
+ __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */
};
struct bpf_sock_tuple {
@@ -3308,7 +3318,8 @@ struct bpf_sock_ops {
#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0)
#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1)
#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2)
-#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently
+#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently
* supported cb flags
*/
@@ -3363,6 +3374,8 @@ enum {
BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after
* socket transition to LISTEN state.
*/
+ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
@@ -3541,4 +3554,15 @@ struct bpf_sysctl {
*/
};
+struct bpf_sockopt {
+ __bpf_md_ptr(struct bpf_sock *, sk);
+ __bpf_md_ptr(void *, optval);
+ __bpf_md_ptr(void *, optval_end);
+
+ __s32 level;
+ __s32 optname;
+ __s32 optlen;
+ __s32 retval;
+};
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index caed8b1614ff..faaa5ca2a117 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -46,6 +46,7 @@ struct xdp_mmap_offsets {
#define XDP_UMEM_FILL_RING 5
#define XDP_UMEM_COMPLETION_RING 6
#define XDP_STATISTICS 7
+#define XDP_OPTIONS 8
struct xdp_umem_reg {
__u64 addr; /* Start of packet data area */
@@ -60,6 +61,13 @@ struct xdp_statistics {
__u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
};
+struct xdp_options {
+ __u32 flags;
+};
+
+/* Flags for the flags field of struct xdp_options */
+#define XDP_OPTIONS_ZEROCOPY (1 << 0)
+
/* Pgoff for mmaping the rings */
#define XDP_PGOFF_RX_RING 0
#define XDP_PGOFF_TX_RING 0x80000000
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 4259c9f0cfe7..4907997289e9 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -778,7 +778,7 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj)
if (obj->nr_maps < obj->maps_cap)
return &obj->maps[obj->nr_maps++];
- new_cap = max(4ul, obj->maps_cap * 3 / 2);
+ new_cap = max((size_t)4, obj->maps_cap * 3 / 2);
new_maps = realloc(obj->maps, new_cap * sizeof(*obj->maps));
if (!new_maps) {
pr_warning("alloc maps for object failed\n");
@@ -1169,7 +1169,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
pr_debug("map '%s': found key_size = %u.\n",
map_name, sz);
if (map->def.key_size && map->def.key_size != sz) {
- pr_warning("map '%s': conflictling key size %u != %u.\n",
+ pr_warning("map '%s': conflicting key size %u != %u.\n",
map_name, map->def.key_size, sz);
return -EINVAL;
}
@@ -1197,7 +1197,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
pr_debug("map '%s': found key [%u], sz = %lld.\n",
map_name, t->type, sz);
if (map->def.key_size && map->def.key_size != sz) {
- pr_warning("map '%s': conflictling key size %u != %lld.\n",
+ pr_warning("map '%s': conflicting key size %u != %lld.\n",
map_name, map->def.key_size, sz);
return -EINVAL;
}
@@ -1212,7 +1212,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
pr_debug("map '%s': found value_size = %u.\n",
map_name, sz);
if (map->def.value_size && map->def.value_size != sz) {
- pr_warning("map '%s': conflictling value size %u != %u.\n",
+ pr_warning("map '%s': conflicting value size %u != %u.\n",
map_name, map->def.value_size, sz);
return -EINVAL;
}
@@ -1240,7 +1240,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
pr_debug("map '%s': found value [%u], sz = %lld.\n",
map_name, t->type, sz);
if (map->def.value_size && map->def.value_size != sz) {
- pr_warning("map '%s': conflictling value size %u != %lld.\n",
+ pr_warning("map '%s': conflicting value size %u != %lld.\n",
map_name, map->def.value_size, sz);
return -EINVAL;
}
@@ -2646,6 +2646,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
return false;
case BPF_PROG_TYPE_KPROBE:
default:
@@ -3604,6 +3605,10 @@ static const struct {
BPF_CGROUP_UDP6_RECVMSG),
BPF_EAPROG_SEC("cgroup/sysctl", BPF_PROG_TYPE_CGROUP_SYSCTL,
BPF_CGROUP_SYSCTL),
+ BPF_EAPROG_SEC("cgroup/getsockopt", BPF_PROG_TYPE_CGROUP_SOCKOPT,
+ BPF_CGROUP_GETSOCKOPT),
+ BPF_EAPROG_SEC("cgroup/setsockopt", BPF_PROG_TYPE_CGROUP_SOCKOPT,
+ BPF_CGROUP_SETSOCKOPT),
};
#undef BPF_PROG_SEC_IMPL
@@ -3867,10 +3872,7 @@ int bpf_prog_load(const char *file, enum bpf_prog_type type,
int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
struct bpf_object **pobj, int *prog_fd)
{
- struct bpf_object_open_attr open_attr = {
- .file = attr->file,
- .prog_type = attr->prog_type,
- };
+ struct bpf_object_open_attr open_attr = {};
struct bpf_program *prog, *first_prog = NULL;
enum bpf_attach_type expected_attach_type;
enum bpf_prog_type prog_type;
@@ -3883,6 +3885,9 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
if (!attr->file)
return -EINVAL;
+ open_attr.file = attr->file;
+ open_attr.prog_type = attr->prog_type;
+
obj = bpf_object__open_xattr(&open_attr);
if (IS_ERR_OR_NULL(obj))
return -ENOENT;
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 6635a31a7a16..ace1a0708d99 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -101,6 +101,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
case BPF_PROG_TYPE_SK_REUSEPORT:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
default:
break;
}
diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
index 7ef6293b4fd7..b33740221b7e 100644
--- a/tools/lib/bpf/xsk.c
+++ b/tools/lib/bpf/xsk.c
@@ -65,6 +65,7 @@ struct xsk_socket {
int xsks_map_fd;
__u32 queue_id;
char ifname[IFNAMSIZ];
+ bool zc;
};
struct xsk_nl_info {
@@ -326,7 +327,8 @@ static int xsk_get_max_queues(struct xsk_socket *xsk)
channels.cmd = ETHTOOL_GCHANNELS;
ifr.ifr_data = (void *)&channels;
- strncpy(ifr.ifr_name, xsk->ifname, IFNAMSIZ);
+ strncpy(ifr.ifr_name, xsk->ifname, IFNAMSIZ - 1);
+ ifr.ifr_name[IFNAMSIZ - 1] = '\0';
err = ioctl(fd, SIOCETHTOOL, &ifr);
if (err && errno != EOPNOTSUPP) {
ret = -errno;
@@ -480,6 +482,7 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
void *rx_map = NULL, *tx_map = NULL;
struct sockaddr_xdp sxdp = {};
struct xdp_mmap_offsets off;
+ struct xdp_options opts;
struct xsk_socket *xsk;
socklen_t optlen;
int err;
@@ -597,6 +600,16 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
}
xsk->prog_fd = -1;
+
+ optlen = sizeof(opts);
+ err = getsockopt(xsk->fd, SOL_XDP, XDP_OPTIONS, &opts, &optlen);
+ if (err) {
+ err = -errno;
+ goto out_mmap_tx;
+ }
+
+ xsk->zc = opts.flags & XDP_OPTIONS_ZEROCOPY;
+
if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
err = xsk_setup_xdp_prog(xsk);
if (err)
diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h
index 82ea71a0f3ec..833a6e60d065 100644
--- a/tools/lib/bpf/xsk.h
+++ b/tools/lib/bpf/xsk.h
@@ -167,7 +167,7 @@ LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk);
#define XSK_RING_CONS__DEFAULT_NUM_DESCS 2048
#define XSK_RING_PROD__DEFAULT_NUM_DESCS 2048
-#define XSK_UMEM__DEFAULT_FRAME_SHIFT 11 /* 2048 bytes */
+#define XSK_UMEM__DEFAULT_FRAME_SHIFT 12 /* 4096 bytes */
#define XSK_UMEM__DEFAULT_FRAME_SIZE (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT)
#define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 7470327edcfe..a2f7f79c7908 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -39,3 +39,6 @@ libbpf.so.*
test_hashmap
test_btf_dump
xdping
+test_sockopt
+test_sockopt_sk
+test_sockopt_multi
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index fb5ce43e28b3..2620406a53ec 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -15,7 +15,7 @@ LLC ?= llc
LLVM_OBJCOPY ?= llvm-objcopy
LLVM_READELF ?= llvm-readelf
BTF_PAHOLE ?= pahole
-CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(BPFDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include \
+CFLAGS += -g -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(BPFDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include \
-Dbpf_prog_load=bpf_prog_test_load \
-Dbpf_load_program=bpf_test_load_program
LDLIBS += -lcap -lelf -lrt -lpthread
@@ -26,7 +26,8 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
test_sock test_btf test_sockmap get_cgroup_id_user test_socket_cookie \
test_cgroup_storage test_select_reuseport test_section_names \
test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
- test_btf_dump test_cgroup_attach xdping
+ test_btf_dump test_cgroup_attach xdping test_sockopt test_sockopt_sk \
+ test_sockopt_multi test_tcp_rtt
BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
TEST_GEN_FILES = $(BPF_OBJ_FILES)
@@ -46,6 +47,7 @@ TEST_PROGS := test_kmod.sh \
test_libbpf.sh \
test_xdp_redirect.sh \
test_xdp_meta.sh \
+ test_xdp_veth.sh \
test_offload.py \
test_sock_addr.sh \
test_tunnel.sh \
@@ -102,6 +104,10 @@ $(OUTPUT)/test_netcnt: cgroup_helpers.c
$(OUTPUT)/test_sock_fields: cgroup_helpers.c
$(OUTPUT)/test_sysctl: cgroup_helpers.c
$(OUTPUT)/test_cgroup_attach: cgroup_helpers.c
+$(OUTPUT)/test_sockopt: cgroup_helpers.c
+$(OUTPUT)/test_sockopt_sk: cgroup_helpers.c
+$(OUTPUT)/test_sockopt_multi: cgroup_helpers.c
+$(OUTPUT)/test_tcp_rtt: cgroup_helpers.c
.PHONY: force
diff --git a/tools/testing/selftests/bpf/progs/pyperf.h b/tools/testing/selftests/bpf/progs/pyperf.h
index 6b0781391be5..abf6224649be 100644
--- a/tools/testing/selftests/bpf/progs/pyperf.h
+++ b/tools/testing/selftests/bpf/progs/pyperf.h
@@ -75,8 +75,7 @@ typedef struct {
void* co_name; // PyCodeObject.co_name
} FrameData;
-static inline __attribute__((__always_inline__)) void*
-get_thread_state(void* tls_base, PidData* pidData)
+static __always_inline void *get_thread_state(void *tls_base, PidData *pidData)
{
void* thread_state;
int key;
@@ -87,8 +86,8 @@ get_thread_state(void* tls_base, PidData* pidData)
return thread_state;
}
-static inline __attribute__((__always_inline__)) bool
-get_frame_data(void* frame_ptr, PidData* pidData, FrameData* frame, Symbol* symbol)
+static __always_inline bool get_frame_data(void *frame_ptr, PidData *pidData,
+ FrameData *frame, Symbol *symbol)
{
// read data from PyFrameObject
bpf_probe_read(&frame->f_back,
@@ -161,7 +160,7 @@ struct bpf_elf_map SEC("maps") stackmap = {
.max_elem = 1000,
};
-static inline __attribute__((__always_inline__)) int __on_event(struct pt_regs *ctx)
+static __always_inline int __on_event(struct pt_regs *ctx)
{
uint64_t pid_tgid = bpf_get_current_pid_tgid();
pid_t pid = (pid_t)(pid_tgid >> 32);
diff --git a/tools/testing/selftests/bpf/progs/sockopt_multi.c b/tools/testing/selftests/bpf/progs/sockopt_multi.c
new file mode 100644
index 000000000000..4afd2595c08e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sockopt_multi.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <netinet/in.h>
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
+
+SEC("cgroup/getsockopt/child")
+int _getsockopt_child(struct bpf_sockopt *ctx)
+{
+ __u8 *optval_end = ctx->optval_end;
+ __u8 *optval = ctx->optval;
+
+ if (ctx->level != SOL_IP || ctx->optname != IP_TOS)
+ return 1;
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ if (optval[0] != 0x80)
+ return 0; /* EPERM, unexpected optval from the kernel */
+
+ ctx->retval = 0; /* Reset system call return value to zero */
+
+ optval[0] = 0x90;
+ ctx->optlen = 1;
+
+ return 1;
+}
+
+SEC("cgroup/getsockopt/parent")
+int _getsockopt_parent(struct bpf_sockopt *ctx)
+{
+ __u8 *optval_end = ctx->optval_end;
+ __u8 *optval = ctx->optval;
+
+ if (ctx->level != SOL_IP || ctx->optname != IP_TOS)
+ return 1;
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ if (optval[0] != 0x90)
+ return 0; /* EPERM, unexpected optval from the kernel */
+
+ ctx->retval = 0; /* Reset system call return value to zero */
+
+ optval[0] = 0xA0;
+ ctx->optlen = 1;
+
+ return 1;
+}
+
+SEC("cgroup/setsockopt")
+int _setsockopt(struct bpf_sockopt *ctx)
+{
+ __u8 *optval_end = ctx->optval_end;
+ __u8 *optval = ctx->optval;
+
+ if (ctx->level != SOL_IP || ctx->optname != IP_TOS)
+ return 1;
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ optval[0] += 0x10;
+ ctx->optlen = 1;
+
+ return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c
new file mode 100644
index 000000000000..076122c898e9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <netinet/in.h>
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
+
+#define SOL_CUSTOM 0xdeadbeef
+
+struct sockopt_sk {
+ __u8 val;
+};
+
+struct bpf_map_def SEC("maps") socket_storage_map = {
+ .type = BPF_MAP_TYPE_SK_STORAGE,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct sockopt_sk),
+ .map_flags = BPF_F_NO_PREALLOC,
+};
+BPF_ANNOTATE_KV_PAIR(socket_storage_map, int, struct sockopt_sk);
+
+SEC("cgroup/getsockopt")
+int _getsockopt(struct bpf_sockopt *ctx)
+{
+ __u8 *optval_end = ctx->optval_end;
+ __u8 *optval = ctx->optval;
+ struct sockopt_sk *storage;
+
+ if (ctx->level == SOL_IP && ctx->optname == IP_TOS)
+ /* Not interested in SOL_IP:IP_TOS;
+ * let next BPF program in the cgroup chain or kernel
+ * handle it.
+ */
+ return 1;
+
+ if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) {
+ /* Not interested in SOL_SOCKET:SO_SNDBUF;
+ * let next BPF program in the cgroup chain or kernel
+ * handle it.
+ */
+ return 1;
+ }
+
+ if (ctx->level != SOL_CUSTOM)
+ return 0; /* EPERM, deny everything except custom level */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!storage)
+ return 0; /* EPERM, couldn't get sk storage */
+
+ if (!ctx->retval)
+ return 0; /* EPERM, kernel should not have handled
+ * SOL_CUSTOM, something is wrong!
+ */
+ ctx->retval = 0; /* Reset system call return value to zero */
+
+ optval[0] = storage->val;
+ ctx->optlen = 1;
+
+ return 1;
+}
+
+SEC("cgroup/setsockopt")
+int _setsockopt(struct bpf_sockopt *ctx)
+{
+ __u8 *optval_end = ctx->optval_end;
+ __u8 *optval = ctx->optval;
+ struct sockopt_sk *storage;
+
+ if (ctx->level == SOL_IP && ctx->optname == IP_TOS)
+ /* Not interested in SOL_IP:IP_TOS;
+ * let next BPF program in the cgroup chain or kernel
+ * handle it.
+ */
+ return 1;
+
+ if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) {
+ /* Overwrite SO_SNDBUF value */
+
+ if (optval + sizeof(__u32) > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ *(__u32 *)optval = 0x55AA;
+ ctx->optlen = 4;
+
+ return 1;
+ }
+
+ if (ctx->level != SOL_CUSTOM)
+ return 0; /* EPERM, deny everything except custom level */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!storage)
+ return 0; /* EPERM, couldn't get sk storage */
+
+ storage->val = optval[0];
+ ctx->optlen = -1; /* BPF has consumed this option, don't call kernel
+ * setsockopt handler.
+ */
+
+ return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h
index 1ff73f60a3e4..553bc3b62e89 100644
--- a/tools/testing/selftests/bpf/progs/strobemeta.h
+++ b/tools/testing/selftests/bpf/progs/strobemeta.h
@@ -266,8 +266,8 @@ struct tls_index {
uint64_t offset;
};
-static inline __attribute__((always_inline))
-void *calc_location(struct strobe_value_loc *loc, void *tls_base)
+static __always_inline void *calc_location(struct strobe_value_loc *loc,
+ void *tls_base)
{
/*
* tls_mode value is:
@@ -327,10 +327,10 @@ void *calc_location(struct strobe_value_loc *loc, void *tls_base)
: NULL;
}
-static inline __attribute__((always_inline))
-void read_int_var(struct strobemeta_cfg *cfg, size_t idx, void *tls_base,
- struct strobe_value_generic *value,
- struct strobemeta_payload *data)
+static __always_inline void read_int_var(struct strobemeta_cfg *cfg,
+ size_t idx, void *tls_base,
+ struct strobe_value_generic *value,
+ struct strobemeta_payload *data)
{
void *location = calc_location(&cfg->int_locs[idx], tls_base);
if (!location)
@@ -342,10 +342,11 @@ void read_int_var(struct strobemeta_cfg *cfg, size_t idx, void *tls_base,
data->int_vals_set_mask |= (1 << idx);
}
-static inline __attribute__((always_inline))
-uint64_t read_str_var(struct strobemeta_cfg* cfg, size_t idx, void *tls_base,
- struct strobe_value_generic *value,
- struct strobemeta_payload *data, void *payload)
+static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
+ size_t idx, void *tls_base,
+ struct strobe_value_generic *value,
+ struct strobemeta_payload *data,
+ void *payload)
{
void *location;
uint32_t len;
@@ -371,10 +372,11 @@ uint64_t read_str_var(struct strobemeta_cfg* cfg, size_t idx, void *tls_base,
return len;
}
-static inline __attribute__((always_inline))
-void *read_map_var(struct strobemeta_cfg *cfg, size_t idx, void *tls_base,
- struct strobe_value_generic *value,
- struct strobemeta_payload* data, void *payload)
+static __always_inline void *read_map_var(struct strobemeta_cfg *cfg,
+ size_t idx, void *tls_base,
+ struct strobe_value_generic *value,
+ struct strobemeta_payload *data,
+ void *payload)
{
struct strobe_map_descr* descr = &data->map_descrs[idx];
struct strobe_map_raw map;
@@ -435,9 +437,9 @@ void *read_map_var(struct strobemeta_cfg *cfg, size_t idx, void *tls_base,
* read_strobe_meta returns NULL, if no metadata was read; otherwise returns
* pointer to *right after* payload ends
*/
-static inline __attribute__((always_inline))
-void *read_strobe_meta(struct task_struct* task,
- struct strobemeta_payload* data) {
+static __always_inline void *read_strobe_meta(struct task_struct *task,
+ struct strobemeta_payload *data)
+{
pid_t pid = bpf_get_current_pid_tgid() >> 32;
struct strobe_value_generic value = {0};
struct strobemeta_cfg *cfg;
diff --git a/tools/testing/selftests/bpf/progs/tcp_rtt.c b/tools/testing/selftests/bpf/progs/tcp_rtt.c
new file mode 100644
index 000000000000..233bdcb1659e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tcp_rtt.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
+
+struct tcp_rtt_storage {
+ __u32 invoked;
+ __u32 dsack_dups;
+ __u32 delivered;
+ __u32 delivered_ce;
+ __u32 icsk_retransmits;
+};
+
+struct bpf_map_def SEC("maps") socket_storage_map = {
+ .type = BPF_MAP_TYPE_SK_STORAGE,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct tcp_rtt_storage),
+ .map_flags = BPF_F_NO_PREALLOC,
+};
+BPF_ANNOTATE_KV_PAIR(socket_storage_map, int, struct tcp_rtt_storage);
+
+SEC("sockops")
+int _sockops(struct bpf_sock_ops *ctx)
+{
+ struct tcp_rtt_storage *storage;
+ struct bpf_tcp_sock *tcp_sk;
+ int op = (int) ctx->op;
+ struct bpf_sock *sk;
+
+ sk = ctx->sk;
+ if (!sk)
+ return 1;
+
+ storage = bpf_sk_storage_get(&socket_storage_map, sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!storage)
+ return 1;
+
+ if (op == BPF_SOCK_OPS_TCP_CONNECT_CB) {
+ bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG);
+ return 1;
+ }
+
+ if (op != BPF_SOCK_OPS_RTT_CB)
+ return 1;
+
+ tcp_sk = bpf_tcp_sock(sk);
+ if (!tcp_sk)
+ return 1;
+
+ storage->invoked++;
+
+ storage->dsack_dups = tcp_sk->dsack_dups;
+ storage->delivered = tcp_sk->delivered;
+ storage->delivered_ce = tcp_sk->delivered_ce;
+ storage->icsk_retransmits = tcp_sk->icsk_retransmits;
+
+ return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_jhash.h b/tools/testing/selftests/bpf/progs/test_jhash.h
index 3d12c11a8d47..c300734d26f6 100644
--- a/tools/testing/selftests/bpf/progs/test_jhash.h
+++ b/tools/testing/selftests/bpf/progs/test_jhash.h
@@ -1,9 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2019 Facebook
+#include <features.h>
typedef unsigned int u32;
-static __attribute__((always_inline)) u32 rol32(u32 word, unsigned int shift)
+static __always_inline u32 rol32(u32 word, unsigned int shift)
{
return (word << shift) | (word >> ((-shift) & 31));
}
diff --git a/tools/testing/selftests/bpf/progs/test_seg6_loop.c b/tools/testing/selftests/bpf/progs/test_seg6_loop.c
index 463964d79f73..1dbe1d4d467e 100644
--- a/tools/testing/selftests/bpf/progs/test_seg6_loop.c
+++ b/tools/testing/selftests/bpf/progs/test_seg6_loop.c
@@ -54,7 +54,7 @@ struct sr6_tlv_t {
unsigned char value[0];
} BPF_PACKET_HEADER;
-static __attribute__((always_inline)) struct ip6_srh_t *get_srh(struct __sk_buff *skb)
+static __always_inline struct ip6_srh_t *get_srh(struct __sk_buff *skb)
{
void *cursor, *data_end;
struct ip6_srh_t *srh;
@@ -88,9 +88,9 @@ static __attribute__((always_inline)) struct ip6_srh_t *get_srh(struct __sk_buff
return srh;
}
-static __attribute__((always_inline))
-int update_tlv_pad(struct __sk_buff *skb, uint32_t new_pad,
- uint32_t old_pad, uint32_t pad_off)
+static __always_inline int update_tlv_pad(struct __sk_buff *skb,
+ uint32_t new_pad, uint32_t old_pad,
+ uint32_t pad_off)
{
int err;
@@ -118,10 +118,11 @@ int update_tlv_pad(struct __sk_buff *skb, uint32_t new_pad,
return 0;
}
-static __attribute__((always_inline))
-int is_valid_tlv_boundary(struct __sk_buff *skb, struct ip6_srh_t *srh,
- uint32_t *tlv_off, uint32_t *pad_size,
- uint32_t *pad_off)
+static __always_inline int is_valid_tlv_boundary(struct __sk_buff *skb,
+ struct ip6_srh_t *srh,
+ uint32_t *tlv_off,
+ uint32_t *pad_size,
+ uint32_t *pad_off)
{
uint32_t srh_off, cur_off;
int offset_valid = 0;
@@ -177,9 +178,9 @@ int is_valid_tlv_boundary(struct __sk_buff *skb, struct ip6_srh_t *srh,
return 0;
}
-static __attribute__((always_inline))
-int add_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh, uint32_t tlv_off,
- struct sr6_tlv_t *itlv, uint8_t tlv_size)
+static __always_inline int add_tlv(struct __sk_buff *skb,
+ struct ip6_srh_t *srh, uint32_t tlv_off,
+ struct sr6_tlv_t *itlv, uint8_t tlv_size)
{
uint32_t srh_off = (char *)srh - (char *)(long)skb->data;
uint8_t len_remaining, new_pad;
diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale2.c b/tools/testing/selftests/bpf/progs/test_verif_scale2.c
index 77830693eccb..9897150ed516 100644
--- a/tools/testing/selftests/bpf/progs/test_verif_scale2.c
+++ b/tools/testing/selftests/bpf/progs/test_verif_scale2.c
@@ -2,7 +2,7 @@
// Copyright (c) 2019 Facebook
#include <linux/bpf.h>
#include "bpf_helpers.h"
-#define ATTR __attribute__((always_inline))
+#define ATTR __always_inline
#include "test_jhash.h"
SEC("scale90_inline")
diff --git a/tools/testing/selftests/bpf/progs/xdp_redirect_map.c b/tools/testing/selftests/bpf/progs/xdp_redirect_map.c
new file mode 100644
index 000000000000..e87a985b9df9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_redirect_map.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") tx_port = {
+ .type = BPF_MAP_TYPE_DEVMAP,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 8,
+};
+
+SEC("redirect_map_0")
+int xdp_redirect_map_0(struct xdp_md *xdp)
+{
+ return bpf_redirect_map(&tx_port, 0, 0);
+}
+
+SEC("redirect_map_1")
+int xdp_redirect_map_1(struct xdp_md *xdp)
+{
+ return bpf_redirect_map(&tx_port, 1, 0);
+}
+
+SEC("redirect_map_2")
+int xdp_redirect_map_2(struct xdp_md *xdp)
+{
+ return bpf_redirect_map(&tx_port, 2, 0);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/xdp_tx.c b/tools/testing/selftests/bpf/progs/xdp_tx.c
new file mode 100644
index 000000000000..57912e7c94b0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_tx.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("tx")
+int xdp_tx(struct xdp_md *xdp)
+{
+ return XDP_TX;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_section_names.c b/tools/testing/selftests/bpf/test_section_names.c
index dee2f2eceb0f..29833aeaf0de 100644
--- a/tools/testing/selftests/bpf/test_section_names.c
+++ b/tools/testing/selftests/bpf/test_section_names.c
@@ -134,6 +134,16 @@ static struct sec_name_test tests[] = {
{0, BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_CGROUP_SYSCTL},
{0, BPF_CGROUP_SYSCTL},
},
+ {
+ "cgroup/getsockopt",
+ {0, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT},
+ {0, BPF_CGROUP_GETSOCKOPT},
+ },
+ {
+ "cgroup/setsockopt",
+ {0, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT},
+ {0, BPF_CGROUP_SETSOCKOPT},
+ },
};
static int test_prog_type_by_name(const struct sec_name_test *test)
diff --git a/tools/testing/selftests/bpf/test_sockopt.c b/tools/testing/selftests/bpf/test_sockopt.c
new file mode 100644
index 000000000000..23bd0819382d
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockopt.c
@@ -0,0 +1,1021 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include <linux/filter.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+
+#define CG_PATH "/sockopt"
+
+static char bpf_log_buf[4096];
+static bool verbose;
+
+enum sockopt_test_error {
+ OK = 0,
+ DENY_LOAD,
+ DENY_ATTACH,
+ EPERM_GETSOCKOPT,
+ EFAULT_GETSOCKOPT,
+ EPERM_SETSOCKOPT,
+ EFAULT_SETSOCKOPT,
+};
+
+static struct sockopt_test {
+ const char *descr;
+ const struct bpf_insn insns[64];
+ enum bpf_attach_type attach_type;
+ enum bpf_attach_type expected_attach_type;
+
+ int set_optname;
+ int set_level;
+ const char set_optval[64];
+ socklen_t set_optlen;
+
+ int get_optname;
+ int get_level;
+ const char get_optval[64];
+ socklen_t get_optlen;
+ socklen_t get_optlen_ret;
+
+ enum sockopt_test_error error;
+} tests[] = {
+
+ /* ==================== getsockopt ==================== */
+
+ {
+ .descr = "getsockopt: no expected_attach_type",
+ .insns = {
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = 0,
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "getsockopt: wrong expected_attach_type",
+ .insns = {
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+ .error = DENY_ATTACH,
+ },
+ {
+ .descr = "getsockopt: bypass bpf hook",
+ .insns = {
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = SOL_IP,
+
+ .get_optname = IP_TOS,
+ .set_optname = IP_TOS,
+
+ .set_optval = { 1 << 3 },
+ .set_optlen = 1,
+
+ .get_optval = { 1 << 3 },
+ .get_optlen = 1,
+ },
+ {
+ .descr = "getsockopt: return EPERM from bpf hook",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .get_optname = IP_TOS,
+
+ .get_optlen = 1,
+ .error = EPERM_GETSOCKOPT,
+ },
+ {
+ .descr = "getsockopt: no optval bounds check, deny loading",
+ .insns = {
+ /* r6 = ctx->optval */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval)),
+
+ /* ctx->optval[0] = 0x80 */
+ BPF_MOV64_IMM(BPF_REG_0, 0x80),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_0, 0),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "getsockopt: read ctx->level",
+ .insns = {
+ /* r6 = ctx->level */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, level)),
+
+ /* if (ctx->level == 123) { */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 123, 4),
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } else { */
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_level = 123,
+
+ .get_optlen = 1,
+ },
+ {
+ .descr = "getsockopt: deny writing to ctx->level",
+ .insns = {
+ /* ctx->level = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, level)),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "getsockopt: read ctx->optname",
+ .insns = {
+ /* r6 = ctx->optname */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optname)),
+
+ /* if (ctx->optname == 123) { */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 123, 4),
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } else { */
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_optname = 123,
+
+ .get_optlen = 1,
+ },
+ {
+ .descr = "getsockopt: read ctx->retval",
+ .insns = {
+ /* r6 = ctx->retval */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, retval)),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .get_optname = IP_TOS,
+ .get_optlen = 1,
+ },
+ {
+ .descr = "getsockopt: deny writing to ctx->optname",
+ .insns = {
+ /* ctx->optname = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optname)),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "getsockopt: read ctx->optlen",
+ .insns = {
+ /* r6 = ctx->optlen */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optlen)),
+
+ /* if (ctx->optlen == 64) { */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 64, 4),
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } else { */
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_optlen = 64,
+ },
+ {
+ .descr = "getsockopt: deny bigger ctx->optlen",
+ .insns = {
+ /* ctx->optlen = 65 */
+ BPF_MOV64_IMM(BPF_REG_0, 65),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_optlen = 64,
+
+ .error = EFAULT_GETSOCKOPT,
+ },
+ {
+ .descr = "getsockopt: deny arbitrary ctx->retval",
+ .insns = {
+ /* ctx->retval = 123 */
+ BPF_MOV64_IMM(BPF_REG_0, 123),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_optlen = 64,
+
+ .error = EFAULT_GETSOCKOPT,
+ },
+ {
+ .descr = "getsockopt: support smaller ctx->optlen",
+ .insns = {
+ /* ctx->optlen = 32 */
+ BPF_MOV64_IMM(BPF_REG_0, 32),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_optlen = 64,
+ .get_optlen_ret = 32,
+ },
+ {
+ .descr = "getsockopt: deny writing to ctx->optval",
+ .insns = {
+ /* ctx->optval = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optval)),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "getsockopt: deny writing to ctx->optval_end",
+ .insns = {
+ /* ctx->optval_end = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optval_end)),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "getsockopt: rewrite value",
+ .insns = {
+ /* r6 = ctx->optval */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval)),
+ /* r2 = ctx->optval */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+ /* r6 = ctx->optval + 1 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+
+ /* r7 = ctx->optval_end */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval_end)),
+
+ /* if (ctx->optval + 1 <= ctx->optval_end) { */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 1),
+ /* ctx->optval[0] = 0xF0 */
+ BPF_ST_MEM(BPF_B, BPF_REG_2, 0, 0xF0),
+ /* } */
+
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+
+ /* return 1*/
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .get_optname = IP_TOS,
+
+ .get_optval = { 0xF0 },
+ .get_optlen = 1,
+ },
+
+ /* ==================== setsockopt ==================== */
+
+ {
+ .descr = "setsockopt: no expected_attach_type",
+ .insns = {
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = 0,
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "setsockopt: wrong expected_attach_type",
+ .insns = {
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+ .error = DENY_ATTACH,
+ },
+ {
+ .descr = "setsockopt: bypass bpf hook",
+ .insns = {
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = SOL_IP,
+
+ .get_optname = IP_TOS,
+ .set_optname = IP_TOS,
+
+ .set_optval = { 1 << 3 },
+ .set_optlen = 1,
+
+ .get_optval = { 1 << 3 },
+ .get_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: return EPERM from bpf hook",
+ .insns = {
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_level = SOL_IP,
+ .set_optname = IP_TOS,
+
+ .set_optlen = 1,
+ .error = EPERM_SETSOCKOPT,
+ },
+ {
+ .descr = "setsockopt: no optval bounds check, deny loading",
+ .insns = {
+ /* r6 = ctx->optval */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval)),
+
+ /* r0 = ctx->optval[0] */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "setsockopt: read ctx->level",
+ .insns = {
+ /* r6 = ctx->level */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, level)),
+
+ /* if (ctx->level == 123) { */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 123, 4),
+ /* ctx->optlen = -1 */
+ BPF_MOV64_IMM(BPF_REG_0, -1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } else { */
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_level = 123,
+
+ .set_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: allow changing ctx->level",
+ .insns = {
+ /* ctx->level = SOL_IP */
+ BPF_MOV64_IMM(BPF_REG_0, SOL_IP),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, level)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = 234, /* should be rewritten to SOL_IP */
+
+ .get_optname = IP_TOS,
+ .set_optname = IP_TOS,
+
+ .set_optval = { 1 << 3 },
+ .set_optlen = 1,
+ .get_optval = { 1 << 3 },
+ .get_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: read ctx->optname",
+ .insns = {
+ /* r6 = ctx->optname */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optname)),
+
+ /* if (ctx->optname == 123) { */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 123, 4),
+ /* ctx->optlen = -1 */
+ BPF_MOV64_IMM(BPF_REG_0, -1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } else { */
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_optname = 123,
+
+ .set_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: allow changing ctx->optname",
+ .insns = {
+ /* ctx->optname = IP_TOS */
+ BPF_MOV64_IMM(BPF_REG_0, IP_TOS),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optname)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = SOL_IP,
+
+ .get_optname = IP_TOS,
+ .set_optname = 456, /* should be rewritten to IP_TOS */
+
+ .set_optval = { 1 << 3 },
+ .set_optlen = 1,
+ .get_optval = { 1 << 3 },
+ .get_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: read ctx->optlen",
+ .insns = {
+ /* r6 = ctx->optlen */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optlen)),
+
+ /* if (ctx->optlen == 64) { */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 64, 4),
+ /* ctx->optlen = -1 */
+ BPF_MOV64_IMM(BPF_REG_0, -1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } else { */
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_optlen = 64,
+ },
+ {
+ .descr = "setsockopt: ctx->optlen == -1 is ok",
+ .insns = {
+ /* ctx->optlen = -1 */
+ BPF_MOV64_IMM(BPF_REG_0, -1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_optlen = 64,
+ },
+ {
+ .descr = "setsockopt: deny ctx->optlen < 0 (except -1)",
+ .insns = {
+ /* ctx->optlen = -2 */
+ BPF_MOV64_IMM(BPF_REG_0, -2),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_optlen = 4,
+
+ .error = EFAULT_SETSOCKOPT,
+ },
+ {
+ .descr = "setsockopt: deny ctx->optlen > input optlen",
+ .insns = {
+ /* ctx->optlen = 65 */
+ BPF_MOV64_IMM(BPF_REG_0, 65),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_optlen = 64,
+
+ .error = EFAULT_SETSOCKOPT,
+ },
+ {
+ .descr = "setsockopt: allow changing ctx->optlen within bounds",
+ .insns = {
+ /* r6 = ctx->optval */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval)),
+ /* r2 = ctx->optval */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+ /* r6 = ctx->optval + 1 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+
+ /* r7 = ctx->optval_end */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval_end)),
+
+ /* if (ctx->optval + 1 <= ctx->optval_end) { */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 1),
+ /* ctx->optval[0] = 1 << 3 */
+ BPF_ST_MEM(BPF_B, BPF_REG_2, 0, 1 << 3),
+ /* } */
+
+ /* ctx->optlen = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+
+ /* return 1*/
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = SOL_IP,
+
+ .get_optname = IP_TOS,
+ .set_optname = IP_TOS,
+
+ .set_optval = { 1, 1, 1, 1 },
+ .set_optlen = 4,
+ .get_optval = { 1 << 3 },
+ .get_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: deny write ctx->retval",
+ .insns = {
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "setsockopt: deny read ctx->retval",
+ .insns = {
+ /* r6 = ctx->retval */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, retval)),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "setsockopt: deny writing to ctx->optval",
+ .insns = {
+ /* ctx->optval = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optval)),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "setsockopt: deny writing to ctx->optval_end",
+ .insns = {
+ /* ctx->optval_end = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optval_end)),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "setsockopt: allow IP_TOS <= 128",
+ .insns = {
+ /* r6 = ctx->optval */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval)),
+ /* r7 = ctx->optval + 1 */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 1),
+
+ /* r8 = ctx->optval_end */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval_end)),
+
+ /* if (ctx->optval + 1 <= ctx->optval_end) { */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_7, BPF_REG_8, 4),
+
+ /* r9 = ctx->optval[0] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_6, 0),
+
+ /* if (ctx->optval[0] < 128) */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_9, 128, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } */
+
+ /* } else { */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = SOL_IP,
+
+ .get_optname = IP_TOS,
+ .set_optname = IP_TOS,
+
+ .set_optval = { 0x80 },
+ .set_optlen = 1,
+ .get_optval = { 0x80 },
+ .get_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: deny IP_TOS > 128",
+ .insns = {
+ /* r6 = ctx->optval */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval)),
+ /* r7 = ctx->optval + 1 */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 1),
+
+ /* r8 = ctx->optval_end */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval_end)),
+
+ /* if (ctx->optval + 1 <= ctx->optval_end) { */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_7, BPF_REG_8, 4),
+
+ /* r9 = ctx->optval[0] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_6, 0),
+
+ /* if (ctx->optval[0] < 128) */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_9, 128, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } */
+
+ /* } else { */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = SOL_IP,
+
+ .get_optname = IP_TOS,
+ .set_optname = IP_TOS,
+
+ .set_optval = { 0x81 },
+ .set_optlen = 1,
+ .get_optval = { 0x00 },
+ .get_optlen = 1,
+
+ .error = EPERM_SETSOCKOPT,
+ },
+};
+
+static int load_prog(const struct bpf_insn *insns,
+ enum bpf_attach_type expected_attach_type)
+{
+ struct bpf_load_program_attr attr = {
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT,
+ .expected_attach_type = expected_attach_type,
+ .insns = insns,
+ .license = "GPL",
+ .log_level = 2,
+ };
+ int fd;
+
+ for (;
+ insns[attr.insns_cnt].code != (BPF_JMP | BPF_EXIT);
+ attr.insns_cnt++) {
+ }
+ attr.insns_cnt++;
+
+ fd = bpf_load_program_xattr(&attr, bpf_log_buf, sizeof(bpf_log_buf));
+ if (verbose && fd < 0)
+ fprintf(stderr, "%s\n", bpf_log_buf);
+
+ return fd;
+}
+
+static int run_test(int cgroup_fd, struct sockopt_test *test)
+{
+ int sock_fd, err, prog_fd;
+ void *optval = NULL;
+ int ret = 0;
+
+ prog_fd = load_prog(test->insns, test->expected_attach_type);
+ if (prog_fd < 0) {
+ if (test->error == DENY_LOAD)
+ return 0;
+
+ log_err("Failed to load BPF program");
+ return -1;
+ }
+
+ err = bpf_prog_attach(prog_fd, cgroup_fd, test->attach_type, 0);
+ if (err < 0) {
+ if (test->error == DENY_ATTACH)
+ goto close_prog_fd;
+
+ log_err("Failed to attach BPF program");
+ ret = -1;
+ goto close_prog_fd;
+ }
+
+ sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (sock_fd < 0) {
+ log_err("Failed to create AF_INET socket");
+ ret = -1;
+ goto detach_prog;
+ }
+
+ if (test->set_optlen) {
+ err = setsockopt(sock_fd, test->set_level, test->set_optname,
+ test->set_optval, test->set_optlen);
+ if (err) {
+ if (errno == EPERM && test->error == EPERM_SETSOCKOPT)
+ goto close_sock_fd;
+ if (errno == EFAULT && test->error == EFAULT_SETSOCKOPT)
+ goto free_optval;
+
+ log_err("Failed to call setsockopt");
+ ret = -1;
+ goto close_sock_fd;
+ }
+ }
+
+ if (test->get_optlen) {
+ optval = malloc(test->get_optlen);
+ socklen_t optlen = test->get_optlen;
+ socklen_t expected_get_optlen = test->get_optlen_ret ?:
+ test->get_optlen;
+
+ err = getsockopt(sock_fd, test->get_level, test->get_optname,
+ optval, &optlen);
+ if (err) {
+ if (errno == EPERM && test->error == EPERM_GETSOCKOPT)
+ goto free_optval;
+ if (errno == EFAULT && test->error == EFAULT_GETSOCKOPT)
+ goto free_optval;
+
+ log_err("Failed to call getsockopt");
+ ret = -1;
+ goto free_optval;
+ }
+
+ if (optlen != expected_get_optlen) {
+ errno = 0;
+ log_err("getsockopt returned unexpected optlen");
+ ret = -1;
+ goto free_optval;
+ }
+
+ if (memcmp(optval, test->get_optval, optlen) != 0) {
+ errno = 0;
+ log_err("getsockopt returned unexpected optval");
+ ret = -1;
+ goto free_optval;
+ }
+ }
+
+ ret = test->error != OK;
+
+free_optval:
+ free(optval);
+close_sock_fd:
+ close(sock_fd);
+detach_prog:
+ bpf_prog_detach2(prog_fd, cgroup_fd, test->attach_type);
+close_prog_fd:
+ close(prog_fd);
+ return ret;
+}
+
+int main(int args, char **argv)
+{
+ int err = EXIT_FAILURE, error_cnt = 0;
+ int cgroup_fd, i;
+
+ if (setup_cgroup_environment())
+ goto cleanup_obj;
+
+ cgroup_fd = create_and_get_cgroup(CG_PATH);
+ if (cgroup_fd < 0)
+ goto cleanup_cgroup_env;
+
+ if (join_cgroup(CG_PATH))
+ goto cleanup_cgroup;
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ int err = run_test(cgroup_fd, &tests[i]);
+
+ if (err)
+ error_cnt++;
+
+ printf("#%d %s: %s\n", i, err ? "FAIL" : "PASS",
+ tests[i].descr);
+ }
+
+ printf("Summary: %ld PASSED, %d FAILED\n",
+ ARRAY_SIZE(tests) - error_cnt, error_cnt);
+ err = error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
+
+cleanup_cgroup:
+ close(cgroup_fd);
+cleanup_cgroup_env:
+ cleanup_cgroup_environment();
+cleanup_obj:
+ return err;
+}
diff --git a/tools/testing/selftests/bpf/test_sockopt_multi.c b/tools/testing/selftests/bpf/test_sockopt_multi.c
new file mode 100644
index 000000000000..4be3441db867
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockopt_multi.c
@@ -0,0 +1,374 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <error.h>
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include <linux/filter.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+
+static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
+{
+ enum bpf_attach_type attach_type;
+ enum bpf_prog_type prog_type;
+ struct bpf_program *prog;
+ int err;
+
+ err = libbpf_prog_type_by_name(title, &prog_type, &attach_type);
+ if (err) {
+ log_err("Failed to deduct types for %s BPF program", title);
+ return -1;
+ }
+
+ prog = bpf_object__find_program_by_title(obj, title);
+ if (!prog) {
+ log_err("Failed to find %s BPF program", title);
+ return -1;
+ }
+
+ err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd,
+ attach_type, BPF_F_ALLOW_MULTI);
+ if (err) {
+ log_err("Failed to attach %s BPF program", title);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int prog_detach(struct bpf_object *obj, int cgroup_fd, const char *title)
+{
+ enum bpf_attach_type attach_type;
+ enum bpf_prog_type prog_type;
+ struct bpf_program *prog;
+ int err;
+
+ err = libbpf_prog_type_by_name(title, &prog_type, &attach_type);
+ if (err)
+ return -1;
+
+ prog = bpf_object__find_program_by_title(obj, title);
+ if (!prog)
+ return -1;
+
+ err = bpf_prog_detach2(bpf_program__fd(prog), cgroup_fd,
+ attach_type);
+ if (err)
+ return -1;
+
+ return 0;
+}
+
+static int run_getsockopt_test(struct bpf_object *obj, int cg_parent,
+ int cg_child, int sock_fd)
+{
+ socklen_t optlen;
+ __u8 buf;
+ int err;
+
+ /* Set IP_TOS to the expected value (0x80). */
+
+ buf = 0x80;
+ err = setsockopt(sock_fd, SOL_IP, IP_TOS, &buf, 1);
+ if (err < 0) {
+ log_err("Failed to call setsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ buf = 0x00;
+ optlen = 1;
+ err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+ if (err) {
+ log_err("Failed to call getsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ if (buf != 0x80) {
+ log_err("Unexpected getsockopt 0x%x != 0x80 without BPF", buf);
+ err = -1;
+ goto detach;
+ }
+
+ /* Attach child program and make sure it returns new value:
+ * - kernel: -> 0x80
+ * - child: 0x80 -> 0x90
+ */
+
+ err = prog_attach(obj, cg_child, "cgroup/getsockopt/child");
+ if (err)
+ goto detach;
+
+ buf = 0x00;
+ optlen = 1;
+ err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+ if (err) {
+ log_err("Failed to call getsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ if (buf != 0x90) {
+ log_err("Unexpected getsockopt 0x%x != 0x90", buf);
+ err = -1;
+ goto detach;
+ }
+
+ /* Attach parent program and make sure it returns new value:
+ * - kernel: -> 0x80
+ * - child: 0x80 -> 0x90
+ * - parent: 0x90 -> 0xA0
+ */
+
+ err = prog_attach(obj, cg_parent, "cgroup/getsockopt/parent");
+ if (err)
+ goto detach;
+
+ buf = 0x00;
+ optlen = 1;
+ err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+ if (err) {
+ log_err("Failed to call getsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ if (buf != 0xA0) {
+ log_err("Unexpected getsockopt 0x%x != 0xA0", buf);
+ err = -1;
+ goto detach;
+ }
+
+ /* Setting unexpected initial sockopt should return EPERM:
+ * - kernel: -> 0x40
+ * - child: unexpected 0x40, EPERM
+ * - parent: unexpected 0x40, EPERM
+ */
+
+ buf = 0x40;
+ if (setsockopt(sock_fd, SOL_IP, IP_TOS, &buf, 1) < 0) {
+ log_err("Failed to call setsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ buf = 0x00;
+ optlen = 1;
+ err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+ if (!err) {
+ log_err("Unexpected success from getsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ /* Detach child program and make sure we still get EPERM:
+ * - kernel: -> 0x40
+ * - parent: unexpected 0x40, EPERM
+ */
+
+ err = prog_detach(obj, cg_child, "cgroup/getsockopt/child");
+ if (err) {
+ log_err("Failed to detach child program");
+ goto detach;
+ }
+
+ buf = 0x00;
+ optlen = 1;
+ err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+ if (!err) {
+ log_err("Unexpected success from getsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ /* Set initial value to the one the parent program expects:
+ * - kernel: -> 0x90
+ * - parent: 0x90 -> 0xA0
+ */
+
+ buf = 0x90;
+ err = setsockopt(sock_fd, SOL_IP, IP_TOS, &buf, 1);
+ if (err < 0) {
+ log_err("Failed to call setsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ buf = 0x00;
+ optlen = 1;
+ err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+ if (err) {
+ log_err("Failed to call getsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ if (buf != 0xA0) {
+ log_err("Unexpected getsockopt 0x%x != 0xA0", buf);
+ err = -1;
+ goto detach;
+ }
+
+detach:
+ prog_detach(obj, cg_child, "cgroup/getsockopt/child");
+ prog_detach(obj, cg_parent, "cgroup/getsockopt/parent");
+
+ return err;
+}
+
+static int run_setsockopt_test(struct bpf_object *obj, int cg_parent,
+ int cg_child, int sock_fd)
+{
+ socklen_t optlen;
+ __u8 buf;
+ int err;
+
+ /* Set IP_TOS to the expected value (0x80). */
+
+ buf = 0x80;
+ err = setsockopt(sock_fd, SOL_IP, IP_TOS, &buf, 1);
+ if (err < 0) {
+ log_err("Failed to call setsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ buf = 0x00;
+ optlen = 1;
+ err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+ if (err) {
+ log_err("Failed to call getsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ if (buf != 0x80) {
+ log_err("Unexpected getsockopt 0x%x != 0x80 without BPF", buf);
+ err = -1;
+ goto detach;
+ }
+
+ /* Attach child program and make sure it adds 0x10. */
+
+ err = prog_attach(obj, cg_child, "cgroup/setsockopt");
+ if (err)
+ goto detach;
+
+ buf = 0x80;
+ err = setsockopt(sock_fd, SOL_IP, IP_TOS, &buf, 1);
+ if (err < 0) {
+ log_err("Failed to call setsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ buf = 0x00;
+ optlen = 1;
+ err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+ if (err) {
+ log_err("Failed to call getsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ if (buf != 0x80 + 0x10) {
+ log_err("Unexpected getsockopt 0x%x != 0x80 + 0x10", buf);
+ err = -1;
+ goto detach;
+ }
+
+ /* Attach parent program and make sure it adds another 0x10. */
+
+ err = prog_attach(obj, cg_parent, "cgroup/setsockopt");
+ if (err)
+ goto detach;
+
+ buf = 0x80;
+ err = setsockopt(sock_fd, SOL_IP, IP_TOS, &buf, 1);
+ if (err < 0) {
+ log_err("Failed to call setsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ buf = 0x00;
+ optlen = 1;
+ err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+ if (err) {
+ log_err("Failed to call getsockopt(IP_TOS)");
+ goto detach;
+ }
+
+ if (buf != 0x80 + 2 * 0x10) {
+ log_err("Unexpected getsockopt 0x%x != 0x80 + 2 * 0x10", buf);
+ err = -1;
+ goto detach;
+ }
+
+detach:
+ prog_detach(obj, cg_child, "cgroup/setsockopt");
+ prog_detach(obj, cg_parent, "cgroup/setsockopt");
+
+ return err;
+}
+
+int main(int argc, char **argv)
+{
+ struct bpf_prog_load_attr attr = {
+ .file = "./sockopt_multi.o",
+ };
+ int cg_parent = -1, cg_child = -1;
+ struct bpf_object *obj = NULL;
+ int sock_fd = -1;
+ int err = -1;
+ int ignored;
+
+ if (setup_cgroup_environment()) {
+ log_err("Failed to setup cgroup environment\n");
+ goto out;
+ }
+
+ cg_parent = create_and_get_cgroup("/parent");
+ if (cg_parent < 0) {
+ log_err("Failed to create cgroup /parent\n");
+ goto out;
+ }
+
+ cg_child = create_and_get_cgroup("/parent/child");
+ if (cg_child < 0) {
+ log_err("Failed to create cgroup /parent/child\n");
+ goto out;
+ }
+
+ if (join_cgroup("/parent/child")) {
+ log_err("Failed to join cgroup /parent/child\n");
+ goto out;
+ }
+
+ err = bpf_prog_load_xattr(&attr, &obj, &ignored);
+ if (err) {
+ log_err("Failed to load BPF object");
+ goto out;
+ }
+
+ sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (sock_fd < 0) {
+ log_err("Failed to create socket");
+ goto out;
+ }
+
+ if (run_getsockopt_test(obj, cg_parent, cg_child, sock_fd))
+ err = -1;
+ printf("test_sockopt_multi: getsockopt %s\n",
+ err ? "FAILED" : "PASSED");
+
+ if (run_setsockopt_test(obj, cg_parent, cg_child, sock_fd))
+ err = -1;
+ printf("test_sockopt_multi: setsockopt %s\n",
+ err ? "FAILED" : "PASSED");
+
+out:
+ close(sock_fd);
+ bpf_object__close(obj);
+ close(cg_child);
+ close(cg_parent);
+
+ printf("test_sockopt_multi: %s\n", err ? "FAILED" : "PASSED");
+ return err ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/tools/testing/selftests/bpf/test_sockopt_sk.c b/tools/testing/selftests/bpf/test_sockopt_sk.c
new file mode 100644
index 000000000000..036b652e5ca9
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockopt_sk.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include <linux/filter.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+
+#define CG_PATH "/sockopt"
+
+#define SOL_CUSTOM 0xdeadbeef
+
+static int getsetsockopt(void)
+{
+ int fd, err;
+ union {
+ char u8[4];
+ __u32 u32;
+ } buf = {};
+ socklen_t optlen;
+
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (fd < 0) {
+ log_err("Failed to create socket");
+ return -1;
+ }
+
+ /* IP_TOS - BPF bypass */
+
+ buf.u8[0] = 0x08;
+ err = setsockopt(fd, SOL_IP, IP_TOS, &buf, 1);
+ if (err) {
+ log_err("Failed to call setsockopt(IP_TOS)");
+ goto err;
+ }
+
+ buf.u8[0] = 0x00;
+ optlen = 1;
+ err = getsockopt(fd, SOL_IP, IP_TOS, &buf, &optlen);
+ if (err) {
+ log_err("Failed to call getsockopt(IP_TOS)");
+ goto err;
+ }
+
+ if (buf.u8[0] != 0x08) {
+ log_err("Unexpected getsockopt(IP_TOS) buf[0] 0x%02x != 0x08",
+ buf.u8[0]);
+ goto err;
+ }
+
+ /* IP_TTL - EPERM */
+
+ buf.u8[0] = 1;
+ err = setsockopt(fd, SOL_IP, IP_TTL, &buf, 1);
+ if (!err || errno != EPERM) {
+ log_err("Unexpected success from setsockopt(IP_TTL)");
+ goto err;
+ }
+
+ /* SOL_CUSTOM - handled by BPF */
+
+ buf.u8[0] = 0x01;
+ err = setsockopt(fd, SOL_CUSTOM, 0, &buf, 1);
+ if (err) {
+ log_err("Failed to call setsockopt");
+ goto err;
+ }
+
+ buf.u32 = 0x00;
+ optlen = 4;
+ err = getsockopt(fd, SOL_CUSTOM, 0, &buf, &optlen);
+ if (err) {
+ log_err("Failed to call getsockopt");
+ goto err;
+ }
+
+ if (optlen != 1) {
+ log_err("Unexpected optlen %d != 1", optlen);
+ goto err;
+ }
+ if (buf.u8[0] != 0x01) {
+ log_err("Unexpected buf[0] 0x%02x != 0x01", buf.u8[0]);
+ goto err;
+ }
+
+ /* SO_SNDBUF is overwritten */
+
+ buf.u32 = 0x01010101;
+ err = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf, 4);
+ if (err) {
+ log_err("Failed to call setsockopt(SO_SNDBUF)");
+ goto err;
+ }
+
+ buf.u32 = 0x00;
+ optlen = 4;
+ err = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf, &optlen);
+ if (err) {
+ log_err("Failed to call getsockopt(SO_SNDBUF)");
+ goto err;
+ }
+
+ if (buf.u32 != 0x55AA*2) {
+ log_err("Unexpected getsockopt(SO_SNDBUF) 0x%x != 0x55AA*2",
+ buf.u32);
+ goto err;
+ }
+
+ close(fd);
+ return 0;
+err:
+ close(fd);
+ return -1;
+}
+
+static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
+{
+ enum bpf_attach_type attach_type;
+ enum bpf_prog_type prog_type;
+ struct bpf_program *prog;
+ int err;
+
+ err = libbpf_prog_type_by_name(title, &prog_type, &attach_type);
+ if (err) {
+ log_err("Failed to deduct types for %s BPF program", title);
+ return -1;
+ }
+
+ prog = bpf_object__find_program_by_title(obj, title);
+ if (!prog) {
+ log_err("Failed to find %s BPF program", title);
+ return -1;
+ }
+
+ err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd,
+ attach_type, 0);
+ if (err) {
+ log_err("Failed to attach %s BPF program", title);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int run_test(int cgroup_fd)
+{
+ struct bpf_prog_load_attr attr = {
+ .file = "./sockopt_sk.o",
+ };
+ struct bpf_object *obj;
+ int ignored;
+ int err;
+
+ err = bpf_prog_load_xattr(&attr, &obj, &ignored);
+ if (err) {
+ log_err("Failed to load BPF object");
+ return -1;
+ }
+
+ err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt");
+ if (err)
+ goto close_bpf_object;
+
+ err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt");
+ if (err)
+ goto close_bpf_object;
+
+ err = getsetsockopt();
+
+close_bpf_object:
+ bpf_object__close(obj);
+ return err;
+}
+
+int main(int args, char **argv)
+{
+ int cgroup_fd;
+ int err = EXIT_SUCCESS;
+
+ if (setup_cgroup_environment())
+ goto cleanup_obj;
+
+ cgroup_fd = create_and_get_cgroup(CG_PATH);
+ if (cgroup_fd < 0)
+ goto cleanup_cgroup_env;
+
+ if (join_cgroup(CG_PATH))
+ goto cleanup_cgroup;
+
+ if (run_test(cgroup_fd))
+ err = EXIT_FAILURE;
+
+ printf("test_sockopt_sk: %s\n",
+ err == EXIT_SUCCESS ? "PASSED" : "FAILED");
+
+cleanup_cgroup:
+ close(cgroup_fd);
+cleanup_cgroup_env:
+ cleanup_cgroup_environment();
+cleanup_obj:
+ return err;
+}
diff --git a/tools/testing/selftests/bpf/test_tcp_rtt.c b/tools/testing/selftests/bpf/test_tcp_rtt.c
new file mode 100644
index 000000000000..90c3862f74a8
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcp_rtt.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <error.h>
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <pthread.h>
+
+#include <linux/filter.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+
+#define CG_PATH "/tcp_rtt"
+
+struct tcp_rtt_storage {
+ __u32 invoked;
+ __u32 dsack_dups;
+ __u32 delivered;
+ __u32 delivered_ce;
+ __u32 icsk_retransmits;
+};
+
+static void send_byte(int fd)
+{
+ char b = 0x55;
+
+ if (write(fd, &b, sizeof(b)) != 1)
+ error(1, errno, "Failed to send single byte");
+}
+
+static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked,
+ __u32 dsack_dups, __u32 delivered, __u32 delivered_ce,
+ __u32 icsk_retransmits)
+{
+ int err = 0;
+ struct tcp_rtt_storage val;
+
+ if (bpf_map_lookup_elem(map_fd, &client_fd, &val) < 0)
+ error(1, errno, "Failed to read socket storage");
+
+ if (val.invoked != invoked) {
+ log_err("%s: unexpected bpf_tcp_sock.invoked %d != %d",
+ msg, val.invoked, invoked);
+ err++;
+ }
+
+ if (val.dsack_dups != dsack_dups) {
+ log_err("%s: unexpected bpf_tcp_sock.dsack_dups %d != %d",
+ msg, val.dsack_dups, dsack_dups);
+ err++;
+ }
+
+ if (val.delivered != delivered) {
+ log_err("%s: unexpected bpf_tcp_sock.delivered %d != %d",
+ msg, val.delivered, delivered);
+ err++;
+ }
+
+ if (val.delivered_ce != delivered_ce) {
+ log_err("%s: unexpected bpf_tcp_sock.delivered_ce %d != %d",
+ msg, val.delivered_ce, delivered_ce);
+ err++;
+ }
+
+ if (val.icsk_retransmits != icsk_retransmits) {
+ log_err("%s: unexpected bpf_tcp_sock.icsk_retransmits %d != %d",
+ msg, val.icsk_retransmits, icsk_retransmits);
+ err++;
+ }
+
+ return err;
+}
+
+static int connect_to_server(int server_fd)
+{
+ struct sockaddr_storage addr;
+ socklen_t len = sizeof(addr);
+ int fd;
+
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (fd < 0) {
+ log_err("Failed to create client socket");
+ return -1;
+ }
+
+ if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
+ log_err("Failed to get server addr");
+ goto out;
+ }
+
+ if (connect(fd, (const struct sockaddr *)&addr, len) < 0) {
+ log_err("Fail to connect to server");
+ goto out;
+ }
+
+ return fd;
+
+out:
+ close(fd);
+ return -1;
+}
+
+static int run_test(int cgroup_fd, int server_fd)
+{
+ struct bpf_prog_load_attr attr = {
+ .prog_type = BPF_PROG_TYPE_SOCK_OPS,
+ .file = "./tcp_rtt.o",
+ .expected_attach_type = BPF_CGROUP_SOCK_OPS,
+ };
+ struct bpf_object *obj;
+ struct bpf_map *map;
+ int client_fd;
+ int prog_fd;
+ int map_fd;
+ int err;
+
+ err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
+ if (err) {
+ log_err("Failed to load BPF object");
+ return -1;
+ }
+
+ map = bpf_map__next(NULL, obj);
+ map_fd = bpf_map__fd(map);
+
+ err = bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_SOCK_OPS, 0);
+ if (err) {
+ log_err("Failed to attach BPF program");
+ goto close_bpf_object;
+ }
+
+ client_fd = connect_to_server(server_fd);
+ if (client_fd < 0) {
+ err = -1;
+ goto close_bpf_object;
+ }
+
+ err += verify_sk(map_fd, client_fd, "syn-ack",
+ /*invoked=*/1,
+ /*dsack_dups=*/0,
+ /*delivered=*/1,
+ /*delivered_ce=*/0,
+ /*icsk_retransmits=*/0);
+
+ send_byte(client_fd);
+
+ err += verify_sk(map_fd, client_fd, "first payload byte",
+ /*invoked=*/2,
+ /*dsack_dups=*/0,
+ /*delivered=*/2,
+ /*delivered_ce=*/0,
+ /*icsk_retransmits=*/0);
+
+ close(client_fd);
+
+close_bpf_object:
+ bpf_object__close(obj);
+ return err;
+}
+
+static int start_server(void)
+{
+ struct sockaddr_in addr = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
+ };
+ int fd;
+
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (fd < 0) {
+ log_err("Failed to create server socket");
+ return -1;
+ }
+
+ if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) {
+ log_err("Failed to bind socket");
+ close(fd);
+ return -1;
+ }
+
+ return fd;
+}
+
+static void *server_thread(void *arg)
+{
+ struct sockaddr_storage addr;
+ socklen_t len = sizeof(addr);
+ int fd = *(int *)arg;
+ int client_fd;
+
+ if (listen(fd, 1) < 0)
+ error(1, errno, "Failed to listed on socket");
+
+ client_fd = accept(fd, (struct sockaddr *)&addr, &len);
+ if (client_fd < 0)
+ error(1, errno, "Failed to accept client");
+
+ /* Wait for the next connection (that never arrives)
+ * to keep this thread alive to prevent calling
+ * close() on client_fd.
+ */
+ if (accept(fd, (struct sockaddr *)&addr, &len) >= 0)
+ error(1, errno, "Unexpected success in second accept");
+
+ close(client_fd);
+
+ return NULL;
+}
+
+int main(int args, char **argv)
+{
+ int server_fd, cgroup_fd;
+ int err = EXIT_SUCCESS;
+ pthread_t tid;
+
+ if (setup_cgroup_environment())
+ goto cleanup_obj;
+
+ cgroup_fd = create_and_get_cgroup(CG_PATH);
+ if (cgroup_fd < 0)
+ goto cleanup_cgroup_env;
+
+ if (join_cgroup(CG_PATH))
+ goto cleanup_cgroup;
+
+ server_fd = start_server();
+ if (server_fd < 0) {
+ err = EXIT_FAILURE;
+ goto cleanup_cgroup;
+ }
+
+ pthread_create(&tid, NULL, server_thread, (void *)&server_fd);
+
+ if (run_test(cgroup_fd, server_fd))
+ err = EXIT_FAILURE;
+
+ close(server_fd);
+
+ printf("test_sockopt_sk: %s\n",
+ err == EXIT_SUCCESS ? "PASSED" : "FAILED");
+
+cleanup_cgroup:
+ close(cgroup_fd);
+cleanup_cgroup_env:
+ cleanup_cgroup_environment();
+cleanup_obj:
+ return err;
+}
diff --git a/tools/testing/selftests/bpf/test_xdp_veth.sh b/tools/testing/selftests/bpf/test_xdp_veth.sh
new file mode 100755
index 000000000000..ba8ffcdaac30
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_veth.sh
@@ -0,0 +1,118 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Create 3 namespaces with 3 veth peers, and
+# forward packets in-between using native XDP
+#
+# XDP_TX
+# NS1(veth11) NS2(veth22) NS3(veth33)
+# | | |
+# | | |
+# (veth1, (veth2, (veth3,
+# id:111) id:122) id:133)
+# ^ | ^ | ^ |
+# | | XDP_REDIRECT | | XDP_REDIRECT | |
+# | ------------------ ------------------ |
+# -----------------------------------------
+# XDP_REDIRECT
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+TESTNAME=xdp_veth
+BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts)
+BPF_DIR=$BPF_FS/test_$TESTNAME
+
+_cleanup()
+{
+ set +e
+ ip link del veth1 2> /dev/null
+ ip link del veth2 2> /dev/null
+ ip link del veth3 2> /dev/null
+ ip netns del ns1 2> /dev/null
+ ip netns del ns2 2> /dev/null
+ ip netns del ns3 2> /dev/null
+ rm -rf $BPF_DIR 2> /dev/null
+}
+
+cleanup_skip()
+{
+ echo "selftests: $TESTNAME [SKIP]"
+ _cleanup
+
+ exit $ksft_skip
+}
+
+cleanup()
+{
+ if [ "$?" = 0 ]; then
+ echo "selftests: $TESTNAME [PASS]"
+ else
+ echo "selftests: $TESTNAME [FAILED]"
+ fi
+ _cleanup
+}
+
+if [ $(id -u) -ne 0 ]; then
+ echo "selftests: $TESTNAME [SKIP] Need root privileges"
+ exit $ksft_skip
+fi
+
+if ! ip link set dev lo xdp off > /dev/null 2>&1; then
+ echo "selftests: $TESTNAME [SKIP] Could not run test without the ip xdp support"
+ exit $ksft_skip
+fi
+
+if [ -z "$BPF_FS" ]; then
+ echo "selftests: $TESTNAME [SKIP] Could not run test without bpffs mounted"
+ exit $ksft_skip
+fi
+
+if ! bpftool version > /dev/null 2>&1; then
+ echo "selftests: $TESTNAME [SKIP] Could not run test without bpftool"
+ exit $ksft_skip
+fi
+
+set -e
+
+trap cleanup_skip EXIT
+
+ip netns add ns1
+ip netns add ns2
+ip netns add ns3
+
+ip link add veth1 index 111 type veth peer name veth11 netns ns1
+ip link add veth2 index 122 type veth peer name veth22 netns ns2
+ip link add veth3 index 133 type veth peer name veth33 netns ns3
+
+ip link set veth1 up
+ip link set veth2 up
+ip link set veth3 up
+
+ip -n ns1 addr add 10.1.1.11/24 dev veth11
+ip -n ns3 addr add 10.1.1.33/24 dev veth33
+
+ip -n ns1 link set dev veth11 up
+ip -n ns2 link set dev veth22 up
+ip -n ns3 link set dev veth33 up
+
+mkdir $BPF_DIR
+bpftool prog loadall \
+ xdp_redirect_map.o $BPF_DIR/progs type xdp \
+ pinmaps $BPF_DIR/maps
+bpftool map update pinned $BPF_DIR/maps/tx_port key 0 0 0 0 value 122 0 0 0
+bpftool map update pinned $BPF_DIR/maps/tx_port key 1 0 0 0 value 133 0 0 0
+bpftool map update pinned $BPF_DIR/maps/tx_port key 2 0 0 0 value 111 0 0 0
+ip link set dev veth1 xdp pinned $BPF_DIR/progs/redirect_map_0
+ip link set dev veth2 xdp pinned $BPF_DIR/progs/redirect_map_1
+ip link set dev veth3 xdp pinned $BPF_DIR/progs/redirect_map_2
+
+ip -n ns1 link set dev veth11 xdp obj xdp_dummy.o sec xdp_dummy
+ip -n ns2 link set dev veth22 xdp obj xdp_tx.o sec tx
+ip -n ns3 link set dev veth33 xdp obj xdp_dummy.o sec xdp_dummy
+
+trap cleanup EXIT
+
+ip netns exec ns1 ping -c 1 -W 1 10.1.1.33
+
+exit 0