about | summary | refs | log | tree | commit | diff | stats | homepage
diff options
context:
space:
mode:
author: David S. Miller <davem@davemloft.net> 2020-05-15 10:43:52 -0700
committer: David S. Miller <davem@davemloft.net> 2020-05-15 10:43:52 -0700
commit: 3430223d393dd23734cc87177d704449cfc294a8 (patch)
tree: 8a82f0640e498d463d8b059aa0dabb27e5214b13
parent: net: dsa: mt7530: fix VLAN setup (diff)
parent: Merge branch 'bpf-cap' (diff)
download: wireguard-linux-3430223d393dd23734cc87177d704449cfc294a8.tar.xz
download: wireguard-linux-3430223d393dd23734cc87177d704449cfc294a8.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-05-15

The following pull-request contains BPF updates for your *net-next* tree.

We've added 37 non-merge commits during the last 1 day(s) which contain
a total of 67 files changed, 741 insertions(+), 252 deletions(-).

The main changes are:

1) bpf_xdp_adjust_tail() now allows to grow the tail as well, from Jesper.

2) bpftool can probe CONFIG_HZ, from Daniel.

3) CAP_BPF is introduced to isolate user processes that use BPF infra and
   to secure BPF networking services by dropping CAP_SYS_ADMIN requirement
   in certain cases, from Alexei.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- drivers/media/rc/bpf-lirc.c | 2
-rw-r--r-- drivers/net/ethernet/amazon/ena/ena_netdev.c | 1
-rw-r--r-- drivers/net/ethernet/amazon/ena/ena_netdev.h | 5
-rw-r--r-- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 1
-rw-r--r-- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 1
-rw-r--r-- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 7
-rw-r--r-- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 30
-rw-r--r-- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2
-rw-r--r-- drivers/net/ethernet/intel/ice/ice_txrx.c | 34
-rw-r--r-- drivers/net/ethernet/intel/ice/ice_xsk.c | 2
-rw-r--r-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 33
-rw-r--r-- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 2
-rw-r--r-- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 34
-rw-r--r-- drivers/net/ethernet/marvell/mvneta.c | 25
-rw-r--r-- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 3
-rw-r--r-- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 1
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 1
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 6
-rw-r--r-- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2
-rw-r--r-- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 6
-rw-r--r-- drivers/net/ethernet/qlogic/qede/qede_fp.c | 1
-rw-r--r-- drivers/net/ethernet/qlogic/qede/qede_main.c | 2
-rw-r--r-- drivers/net/ethernet/sfc/rx.c | 1
-rw-r--r-- drivers/net/ethernet/socionext/netsec.c | 30
-rw-r--r-- drivers/net/ethernet/ti/cpsw.c | 1
-rw-r--r-- drivers/net/ethernet/ti/cpsw_new.c | 1
-rw-r--r-- drivers/net/hyperv/netvsc_bpf.c | 1
-rw-r--r-- drivers/net/hyperv/netvsc_drv.c | 2
-rw-r--r-- drivers/net/tun.c | 2
-rw-r--r-- drivers/net/veth.c | 28
-rw-r--r-- drivers/net/virtio_net.c | 15
-rw-r--r-- drivers/vhost/net.c | 1
-rw-r--r-- include/linux/bpf.h | 18
-rw-r--r-- include/linux/bpf_verifier.h | 3
-rw-r--r-- include/linux/capability.h | 5
-rw-r--r-- include/net/xdp.h | 27
-rw-r--r-- include/net/xdp_sock.h | 11
-rw-r--r-- include/uapi/linux/bpf.h | 4
-rw-r--r-- include/uapi/linux/capability.h | 34
-rw-r--r-- kernel/bpf/arraymap.c | 10
-rw-r--r-- kernel/bpf/bpf_struct_ops.c | 2
-rw-r--r-- kernel/bpf/core.c | 2
-rw-r--r-- kernel/bpf/cpumap.c | 23
-rw-r--r-- kernel/bpf/hashtab.c | 4
-rw-r--r-- kernel/bpf/helpers.c | 4
-rw-r--r-- kernel/bpf/lpm_trie.c | 2
-rw-r--r-- kernel/bpf/map_in_map.c | 2
-rw-r--r-- kernel/bpf/queue_stack_maps.c | 2
-rw-r--r-- kernel/bpf/reuseport_array.c | 2
-rw-r--r-- kernel/bpf/stackmap.c | 2
-rw-r--r-- kernel/bpf/syscall.c | 89
-rw-r--r-- kernel/bpf/verifier.c | 37
-rw-r--r-- kernel/trace/bpf_trace.c | 3
-rw-r--r-- net/bpf/test_run.c | 16
-rw-r--r-- net/core/bpf_sk_storage.c | 4
-rw-r--r-- net/core/dev.c | 14
-rw-r--r-- net/core/filter.c | 19
-rw-r--r-- net/core/xdp.c | 8
-rw-r--r-- security/selinux/include/classmap.h | 4
-rw-r--r-- tools/bpf/bpftool/feature.c | 120
-rw-r--r-- tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c | 123
-rw-r--r-- tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c | 33
-rw-r--r-- tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c (renamed from tools/testing/selftests/bpf/progs/test_adjust_tail.c) | 12
-rw-r--r-- tools/testing/selftests/bpf/test_verifier.c | 44
-rw-r--r-- tools/testing/selftests/bpf/verifier/calls.c | 16
-rw-r--r-- tools/testing/selftests/bpf/verifier/dead_code.c | 10
67 files changed, 741 insertions, 252 deletions
diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
index 069c42f22a8c..5bb144435c16 100644
--- a/drivers/media/rc/bpf-lirc.c
+++ b/drivers/media/rc/bpf-lirc.c
@@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_trace_printk:
- if (capable(CAP_SYS_ADMIN))
+ if (perfmon_capable())
return bpf_get_trace_printk_proto();
/* fall through */
default:
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 2818965427e9..85b87ed02dd5 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -1606,6 +1606,7 @@ static int ena_clean_rx_irq(struct ena_ring *rx_ring, struct napi_struct *napi,
"%s qid %d\n", __func__, rx_ring->qid);
res_budget = budget;
xdp.rxq = &rx_ring->xdp_rxq;
+ xdp.frame_sz = ENA_PAGE_SIZE;
do {
xdp_verdict = XDP_PASS;
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h
index 7df67bf09b93..680099afcccf 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.h
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h
@@ -151,8 +151,9 @@
* The buffer size we share with the device is defined to be ENA_PAGE_SIZE
*/
-#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \
- VLAN_HLEN - XDP_PACKET_HEADROOM)
+#define ENA_XDP_MAX_MTU (ENA_PAGE_SIZE - ETH_HLEN - ETH_FCS_LEN - \
+ VLAN_HLEN - XDP_PACKET_HEADROOM - \
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
#define ENA_IS_XDP_INDEX(adapter, index) (((index) >= (adapter)->xdp_first_ring) && \
((index) < (adapter)->xdp_first_ring + (adapter)->xdp_num_queues))
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index c6f6f2033880..5e3b4a3b69ea 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -138,6 +138,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = *data_ptr + *len;
xdp.rxq = &rxr->xdp_rxq;
+ xdp.frame_sz = PAGE_SIZE; /* BNXT_RX_PAGE_MODE(bp) when XDP enabled */
orig_data = xdp.data;
rcu_read_lock();
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index b4b33368698f..2ba0ce115e63 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -552,6 +552,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
xdp.rxq = &rq->xdp_rxq;
+ xdp.frame_sz = RCV_FRAG_LEN + XDP_PACKET_HEADROOM;
orig_data = xdp.data;
rcu_read_lock();
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index 0f3e842a4fd6..8c8d95aa1dfd 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -331,6 +331,9 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv,
xdp_set_data_meta_invalid(&xdp);
xdp.rxq = &ch->xdp_rxq;
+ xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE -
+ (dpaa2_fd_get_offset(fd) - XDP_PACKET_HEADROOM);
+
xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp);
/* xdp.data pointer may have changed */
@@ -366,7 +369,11 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv,
dma_unmap_page(priv->net_dev->dev.parent, addr,
DPAA2_ETH_RX_BUF_SIZE, DMA_BIDIRECTIONAL);
ch->buf_count--;
+
+ /* Allow redirect use of full headroom */
xdp.data_hard_start = vaddr;
+ xdp.frame_sz = DPAA2_ETH_RX_BUF_RAW_SIZE;
+
err = xdp_do_redirect(priv->net_dev, &xdp, xdp_prog);
if (unlikely(err))
ch->stats.xdp_drop++;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index b8496037ef7f..a3772beffe02 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1507,6 +1507,22 @@ static inline unsigned int i40e_rx_offset(struct i40e_ring *rx_ring)
return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0;
}
+static unsigned int i40e_rx_frame_truesize(struct i40e_ring *rx_ring,
+ unsigned int size)
+{
+ unsigned int truesize;
+
+#if (PAGE_SIZE < 8192)
+ truesize = i40e_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */
+#else
+ truesize = i40e_rx_offset(rx_ring) ?
+ SKB_DATA_ALIGN(size + i40e_rx_offset(rx_ring)) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
+ SKB_DATA_ALIGN(size);
+#endif
+ return truesize;
+}
+
/**
* i40e_alloc_mapped_page - recycle or make a new page
* @rx_ring: ring to use
@@ -2246,13 +2262,11 @@ static void i40e_rx_buffer_flip(struct i40e_ring *rx_ring,
struct i40e_rx_buffer *rx_buffer,
unsigned int size)
{
-#if (PAGE_SIZE < 8192)
- unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2;
+ unsigned int truesize = i40e_rx_frame_truesize(rx_ring, size);
+#if (PAGE_SIZE < 8192)
rx_buffer->page_offset ^= truesize;
#else
- unsigned int truesize = SKB_DATA_ALIGN(i40e_rx_offset(rx_ring) + size);
-
rx_buffer->page_offset += truesize;
#endif
}
@@ -2335,6 +2349,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
bool failure = false;
struct xdp_buff xdp;
+#if (PAGE_SIZE < 8192)
+ xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, 0);
+#endif
xdp.rxq = &rx_ring->xdp_rxq;
while (likely(total_rx_packets < (unsigned int)budget)) {
@@ -2389,7 +2406,10 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
xdp.data_hard_start = xdp.data -
i40e_rx_offset(rx_ring);
xdp.data_end = xdp.data + size;
-
+#if (PAGE_SIZE > 4096)
+ /* At larger PAGE_SIZE, frame_sz depend on len size */
+ xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, size);
+#endif
skb = i40e_run_xdp(rx_ring, &xdp);
}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 0b7d29192b2c..2b9184aead5f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -531,12 +531,14 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
{
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
+ struct xdp_umem *umem = rx_ring->xsk_umem;
unsigned int xdp_res, xdp_xmit = 0;
bool failure = false;
struct sk_buff *skb;
struct xdp_buff xdp;
xdp.rxq = &rx_ring->xdp_rxq;
+ xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
while (likely(total_rx_packets < (unsigned int)budget)) {
struct i40e_rx_buffer *bi;
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index f67e8362958c..69b21b436f9a 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -423,6 +423,22 @@ static unsigned int ice_rx_offset(struct ice_ring *rx_ring)
return 0;
}
+static unsigned int ice_rx_frame_truesize(struct ice_ring *rx_ring,
+ unsigned int size)
+{
+ unsigned int truesize;
+
+#if (PAGE_SIZE < 8192)
+ truesize = ice_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */
+#else
+ truesize = ice_rx_offset(rx_ring) ?
+ SKB_DATA_ALIGN(ice_rx_offset(rx_ring) + size) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
+ SKB_DATA_ALIGN(size);
+#endif
+ return truesize;
+}
+
/**
* ice_run_xdp - Executes an XDP program on initialized xdp_buff
* @rx_ring: Rx ring
@@ -991,6 +1007,10 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
bool failure;
xdp.rxq = &rx_ring->xdp_rxq;
+ /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
+#if (PAGE_SIZE < 8192)
+ xdp.frame_sz = ice_rx_frame_truesize(rx_ring, 0);
+#endif
/* start the loop to process Rx packets bounded by 'budget' */
while (likely(total_rx_pkts < (unsigned int)budget)) {
@@ -1038,6 +1058,10 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
xdp.data_hard_start = xdp.data - ice_rx_offset(rx_ring);
xdp.data_meta = xdp.data;
xdp.data_end = xdp.data + size;
+#if (PAGE_SIZE > 4096)
+ /* At larger PAGE_SIZE, frame_sz depend on len size */
+ xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size);
+#endif
rcu_read_lock();
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
@@ -1051,16 +1075,8 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
if (!xdp_res)
goto construct_skb;
if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) {
- unsigned int truesize;
-
-#if (PAGE_SIZE < 8192)
- truesize = ice_rx_pg_size(rx_ring) / 2;
-#else
- truesize = SKB_DATA_ALIGN(ice_rx_offset(rx_ring) +
- size);
-#endif
xdp_xmit |= xdp_res;
- ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
+ ice_rx_buf_adjust_pg_offset(rx_buf, xdp.frame_sz);
} else {
rx_buf->pagecnt_bias++;
}
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index 8279db15e870..23e5515d4527 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -840,11 +840,13 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
{
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
+ struct xdp_umem *umem = rx_ring->xsk_umem;
unsigned int xdp_xmit = 0;
bool failure = false;
struct xdp_buff xdp;
xdp.rxq = &rx_ring->xdp_rxq;
+ xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
while (likely(total_rx_packets < (unsigned int)budget)) {
union ice_32b_rx_flex_desc *rx_desc;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 718931d951bc..eab5934b04f5 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2244,19 +2244,30 @@ xdp_out:
return ERR_PTR(-result);
}
+static unsigned int ixgbe_rx_frame_truesize(struct ixgbe_ring *rx_ring,
+ unsigned int size)
+{
+ unsigned int truesize;
+
+#if (PAGE_SIZE < 8192)
+ truesize = ixgbe_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */
+#else
+ truesize = ring_uses_build_skb(rx_ring) ?
+ SKB_DATA_ALIGN(IXGBE_SKB_PAD + size) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
+ SKB_DATA_ALIGN(size);
+#endif
+ return truesize;
+}
+
static void ixgbe_rx_buffer_flip(struct ixgbe_ring *rx_ring,
struct ixgbe_rx_buffer *rx_buffer,
unsigned int size)
{
+ unsigned int truesize = ixgbe_rx_frame_truesize(rx_ring, size);
#if (PAGE_SIZE < 8192)
- unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
-
rx_buffer->page_offset ^= truesize;
#else
- unsigned int truesize = ring_uses_build_skb(rx_ring) ?
- SKB_DATA_ALIGN(IXGBE_SKB_PAD + size) :
- SKB_DATA_ALIGN(size);
-
rx_buffer->page_offset += truesize;
#endif
}
@@ -2290,6 +2301,11 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
xdp.rxq = &rx_ring->xdp_rxq;
+ /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
+#if (PAGE_SIZE < 8192)
+ xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, 0);
+#endif
+
while (likely(total_rx_packets < budget)) {
union ixgbe_adv_rx_desc *rx_desc;
struct ixgbe_rx_buffer *rx_buffer;
@@ -2323,7 +2339,10 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
xdp.data_hard_start = xdp.data -
ixgbe_rx_offset(rx_ring);
xdp.data_end = xdp.data + size;
-
+#if (PAGE_SIZE > 4096)
+ /* At larger PAGE_SIZE, frame_sz depend on len size */
+ xdp.frame_sz = ixgbe_rx_frame_truesize(rx_ring, size);
+#endif
skb = ixgbe_run_xdp(adapter, rx_ring, &xdp);
}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index 74b540ebb3dc..a656ee9a1fae 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -431,12 +431,14 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
struct ixgbe_adapter *adapter = q_vector->adapter;
u16 cleaned_count = ixgbe_desc_unused(rx_ring);
+ struct xdp_umem *umem = rx_ring->xsk_umem;
unsigned int xdp_res, xdp_xmit = 0;
bool failure = false;
struct sk_buff *skb;
struct xdp_buff xdp;
xdp.rxq = &rx_ring->xdp_rxq;
+ xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
while (likely(total_rx_packets < budget)) {
union ixgbe_adv_rx_desc *rx_desc;
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 4622c4ea2e46..a39e2cb384dd 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -1095,19 +1095,31 @@ xdp_out:
return ERR_PTR(-result);
}
+static unsigned int ixgbevf_rx_frame_truesize(struct ixgbevf_ring *rx_ring,
+ unsigned int size)
+{
+ unsigned int truesize;
+
+#if (PAGE_SIZE < 8192)
+ truesize = ixgbevf_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */
+#else
+ truesize = ring_uses_build_skb(rx_ring) ?
+ SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
+ SKB_DATA_ALIGN(size);
+#endif
+ return truesize;
+}
+
static void ixgbevf_rx_buffer_flip(struct ixgbevf_ring *rx_ring,
struct ixgbevf_rx_buffer *rx_buffer,
unsigned int size)
{
-#if (PAGE_SIZE < 8192)
- unsigned int truesize = ixgbevf_rx_pg_size(rx_ring) / 2;
+ unsigned int truesize = ixgbevf_rx_frame_truesize(rx_ring, size);
+#if (PAGE_SIZE < 8192)
rx_buffer->page_offset ^= truesize;
#else
- unsigned int truesize = ring_uses_build_skb(rx_ring) ?
- SKB_DATA_ALIGN(IXGBEVF_SKB_PAD + size) :
- SKB_DATA_ALIGN(size);
-
rx_buffer->page_offset += truesize;
#endif
}
@@ -1125,6 +1137,11 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
xdp.rxq = &rx_ring->xdp_rxq;
+ /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
+#if (PAGE_SIZE < 8192)
+ xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, 0);
+#endif
+
while (likely(total_rx_packets < budget)) {
struct ixgbevf_rx_buffer *rx_buffer;
union ixgbe_adv_rx_desc *rx_desc;
@@ -1157,7 +1174,10 @@ static int ixgbevf_clean_rx_irq(struct ixgbevf_q_vector *q_vector,
xdp.data_hard_start = xdp.data -
ixgbevf_rx_offset(rx_ring);
xdp.data_end = xdp.data + size;
-
+#if (PAGE_SIZE > 4096)
+ /* At larger PAGE_SIZE, frame_sz depend on len size */
+ xdp.frame_sz = ixgbevf_rx_frame_truesize(rx_ring, size);
+#endif
skb = ixgbevf_run_xdp(adapter, rx_ring, &xdp);
}
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index e0e9e56830c0..41d2a0eac5fa 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -2148,12 +2148,17 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq,
struct bpf_prog *prog, struct xdp_buff *xdp,
struct mvneta_stats *stats)
{
- unsigned int len;
+ unsigned int len, sync;
+ struct page *page;
u32 ret, act;
len = xdp->data_end - xdp->data_hard_start - pp->rx_offset_correction;
act = bpf_prog_run_xdp(prog, xdp);
+ /* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
+ sync = xdp->data_end - xdp->data_hard_start - pp->rx_offset_correction;
+ sync = max(sync, len);
+
switch (act) {
case XDP_PASS:
stats->xdp_pass++;
@@ -2164,9 +2169,8 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq,
err = xdp_do_redirect(pp->dev, xdp, prog);
if (unlikely(err)) {
ret = MVNETA_XDP_DROPPED;
- page_pool_put_page(rxq->page_pool,
- virt_to_head_page(xdp->data), len,
- true);
+ page = virt_to_head_page(xdp->data);
+ page_pool_put_page(rxq->page_pool, page, sync, true);
} else {
ret = MVNETA_XDP_REDIR;
stats->xdp_redirect++;
@@ -2175,10 +2179,10 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq,
}
case XDP_TX:
ret = mvneta_xdp_xmit_back(pp, xdp);
- if (ret != MVNETA_XDP_TX)
- page_pool_put_page(rxq->page_pool,
- virt_to_head_page(xdp->data), len,
- true);
+ if (ret != MVNETA_XDP_TX) {
+ page = virt_to_head_page(xdp->data);
+ page_pool_put_page(rxq->page_pool, page, sync, true);
+ }
break;
default:
bpf_warn_invalid_xdp_action(act);
@@ -2187,8 +2191,8 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq,
trace_xdp_exception(pp->dev, prog, act);
/* fall through */
case XDP_DROP:
- page_pool_put_page(rxq->page_pool,
- virt_to_head_page(xdp->data), len, true);
+ page = virt_to_head_page(xdp->data);
+ page_pool_put_page(rxq->page_pool, page, sync, true);
ret = MVNETA_XDP_DROPPED;
stats->xdp_drop++;
break;
@@ -2320,6 +2324,7 @@ static int mvneta_rx_swbm(struct napi_struct *napi,
rcu_read_lock();
xdp_prog = READ_ONCE(pp->xdp_prog);
xdp_buf.rxq = &rxq->xdp_rxq;
+ xdp_buf.frame_sz = PAGE_SIZE;
/* Fairness NAPI loop */
while (rx_proc < budget && rx_proc < rx_todo) {
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 43dcbd8214c6..5bd3cd37d50f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -51,7 +51,8 @@
#include "en_port.h"
#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \
- XDP_PACKET_HEADROOM))
+ XDP_PACKET_HEADROOM - \
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))))
int mlx4_en_setup_tc(struct net_device *dev, u8 up)
{
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 787139219813..8a10285b0e10 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -683,6 +683,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
rcu_read_lock();
xdp_prog = rcu_dereference(ring->xdp_prog);
xdp.rxq = &ring->xdp_rxq;
+ xdp.frame_sz = priv->frag_info[0].frag_stride;
doorbell_pending = 0;
/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3bd64c63865b..26911b15f8fe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -625,6 +625,7 @@ struct mlx5e_rq {
struct {
u16 umem_headroom;
u16 headroom;
+ u32 frame0_sz;
u8 map_dir; /* dma map direction */
} buff;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index c4a7fb4ecd14..761c8979bd41 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -137,6 +137,7 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
if (xsk)
xdp.handle = di->xsk.handle;
xdp.rxq = &rq->xdp_rxq;
+ xdp.frame_sz = rq->buff.frame0_sz;
act = bpf_prog_run_xdp(prog, &xdp);
if (xsk) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 0a9dfc31de3e..0e4ca08ddca9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -462,6 +462,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
rq->mpwqe.num_strides =
BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk));
+ rq->buff.frame0_sz = (1 << rq->mpwqe.log_stride_sz);
+
err = mlx5e_create_rq_umr_mkey(mdev, rq);
if (err)
goto err_rq_wq_destroy;
@@ -485,6 +487,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
num_xsk_frames = wq_sz << rq->wqe.info.log_num_frags;
rq->wqe.info = rqp->frags_info;
+ rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride;
+
rq->wqe.frags =
kvzalloc_node(array_size(sizeof(*rq->wqe.frags),
(wq_sz << rq->wqe.info.log_num_frags)),
@@ -522,6 +526,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
}
if (xsk) {
+ rq->buff.frame0_sz = xsk_umem_xdp_frame_sz(umem);
+
err = mlx5e_xsk_resize_reuseq(umem, num_xsk_frames);
if (unlikely(err)) {
mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 779600bebcca..821f94beda7a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -1070,6 +1070,7 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
if (consumed)
return NULL; /* page/packet was consumed by XDP */
+ frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt);
if (unlikely(!skb))
return NULL;
@@ -1371,6 +1372,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
return NULL; /* page/packet was consumed by XDP */
}
+ frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32);
if (unlikely(!skb))
return NULL;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 9bfb3b077bc1..0e0cc3d58bdc 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1741,10 +1741,15 @@ nfp_net_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring,
struct nfp_net_rx_buf *rxbuf, unsigned int dma_off,
unsigned int pkt_len, bool *completed)
{
+ unsigned int dma_map_sz = dp->fl_bufsz - NFP_NET_RX_BUF_NON_DATA;
struct nfp_net_tx_buf *txbuf;
struct nfp_net_tx_desc *txd;
int wr_idx;
+ /* Reject if xdp_adjust_tail grow packet beyond DMA area */
+ if (pkt_len + dma_off > dma_map_sz)
+ return false;
+
if (unlikely(nfp_net_tx_full(tx_ring, 1))) {
if (!*completed) {
nfp_net_xdp_complete(tx_ring);
@@ -1817,6 +1822,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
rcu_read_lock();
xdp_prog = READ_ONCE(dp->xdp_prog);
true_bufsz = xdp_prog ? PAGE_SIZE : dp->fl_bufsz;
+ xdp.frame_sz = PAGE_SIZE - NFP_NET_RX_BUF_HEADROOM;
xdp.rxq = &rx_ring->xdp_rxq;
tx_ring = r_vec->xdp_ring;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index c6c20776b474..7598ebe0962a 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -1066,6 +1066,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + *len;
xdp.rxq = &rxq->xdp_rxq;
+ xdp.frame_sz = rxq->rx_buf_seg_size; /* PAGE_SIZE when XDP enabled */
/* Queues always have a full reset currently, so for the time
* being until there's atomic program replace just mark read
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index f50d9a9b76be..b2d154258b07 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1476,7 +1476,7 @@ static int qede_alloc_mem_rxq(struct qede_dev *edev, struct qede_rx_queue *rxq)
if (rxq->rx_buf_size + size > PAGE_SIZE)
rxq->rx_buf_size = PAGE_SIZE - size;
- /* Segment size to spilt a page in multiple equal parts ,
+ /* Segment size to split a page in multiple equal parts,
* unless XDP is used in which case we'd use the entire page.
*/
if (!edev->xdp_prog) {
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 260352d97d9d..68c47a8c71df 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -308,6 +308,7 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel,
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + rx_buf->len;
xdp.rxq = &rx_queue->xdp_rxq_info;
+ xdp.frame_sz = efx->rx_page_buf_step;
xdp_act = bpf_prog_run_xdp(xdp_prog, &xdp);
rcu_read_unlock();
diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index a5a0fb60193a..e1f4be4b3d69 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -884,23 +884,28 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog,
struct xdp_buff *xdp)
{
struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
- unsigned int len = xdp->data_end - xdp->data;
+ unsigned int sync, len = xdp->data_end - xdp->data;
u32 ret = NETSEC_XDP_PASS;
+ struct page *page;
int err;
u32 act;
act = bpf_prog_run_xdp(prog, xdp);
+ /* Due xdp_adjust_tail: DMA sync for_device cover max len CPU touch */
+ sync = xdp->data_end - xdp->data_hard_start - NETSEC_RXBUF_HEADROOM;
+ sync = max(sync, len);
+
switch (act) {
case XDP_PASS:
ret = NETSEC_XDP_PASS;
break;
case XDP_TX:
ret = netsec_xdp_xmit_back(priv, xdp);
- if (ret != NETSEC_XDP_TX)
- page_pool_put_page(dring->page_pool,
- virt_to_head_page(xdp->data), len,
- true);
+ if (ret != NETSEC_XDP_TX) {
+ page = virt_to_head_page(xdp->data);
+ page_pool_put_page(dring->page_pool, page, sync, true);
+ }
break;
case XDP_REDIRECT:
err = xdp_do_redirect(priv->ndev, xdp, prog);
@@ -908,9 +913,8 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog,
ret = NETSEC_XDP_REDIR;
} else {
ret = NETSEC_XDP_CONSUMED;
- page_pool_put_page(dring->page_pool,
- virt_to_head_page(xdp->data), len,
- true);
+ page = virt_to_head_page(xdp->data);
+ page_pool_put_page(dring->page_pool, page, sync, true);
}
break;
default:
@@ -921,8 +925,8 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog,
/* fall through -- handle aborts by dropping packet */
case XDP_DROP:
ret = NETSEC_XDP_CONSUMED;
- page_pool_put_page(dring->page_pool,
- virt_to_head_page(xdp->data), len, true);
+ page = virt_to_head_page(xdp->data);
+ page_pool_put_page(dring->page_pool, page, sync, true);
break;
}
@@ -936,10 +940,14 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
struct netsec_rx_pkt_info rx_info;
enum dma_data_direction dma_dir;
struct bpf_prog *xdp_prog;
+ struct xdp_buff xdp;
u16 xdp_xmit = 0;
u32 xdp_act = 0;
int done = 0;
+ xdp.rxq = &dring->xdp_rxq;
+ xdp.frame_sz = PAGE_SIZE;
+
rcu_read_lock();
xdp_prog = READ_ONCE(priv->xdp_prog);
dma_dir = page_pool_get_dma_dir(dring->page_pool);
@@ -953,7 +961,6 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
struct sk_buff *skb = NULL;
u16 pkt_len, desc_len;
dma_addr_t dma_handle;
- struct xdp_buff xdp;
void *buf_addr;
if (de->attr & (1U << NETSEC_RX_PKT_OWN_FIELD)) {
@@ -1002,7 +1009,6 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
xdp.data = desc->addr + NETSEC_RXBUF_HEADROOM;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + pkt_len;
- xdp.rxq = &dring->xdp_rxq;
if (xdp_prog) {
xdp_result = netsec_run_xdp(priv, xdp_prog, &xdp);
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 09f98fa2fb4e..ce0645ada6e7 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -406,6 +406,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
xdp.data_hard_start = pa;
xdp.rxq = &priv->xdp_rxq[ch];
+ xdp.frame_sz = PAGE_SIZE;
port = priv->emac_port + cpsw->data.dual_emac;
ret = cpsw_run_xdp(priv, ch, &xdp, page, port);
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index dce49311d3d3..1247d35d42ef 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -348,6 +348,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
xdp.data_hard_start = pa;
xdp.rxq = &priv->xdp_rxq[ch];
+ xdp.frame_sz = PAGE_SIZE;
ret = cpsw_run_xdp(priv, ch, &xdp, page, priv->emac_port);
if (ret != CPSW_XDP_PASS)
diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c
index b86611041db6..1e0c024b0a93 100644
--- a/drivers/net/hyperv/netvsc_bpf.c
+++ b/drivers/net/hyperv/netvsc_bpf.c
@@ -49,6 +49,7 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
xdp_set_data_meta_invalid(xdp);
xdp->data_end = xdp->data + len;
xdp->rxq = &nvchan->xdp_rxq;
+ xdp->frame_sz = PAGE_SIZE;
xdp->handle = 0;
memcpy(xdp->data, data, len);
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 5de57fc3ec60..6267f706e8ee 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -795,7 +795,7 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
if (xbuf) {
unsigned int hdroom = xdp->data - xdp->data_hard_start;
unsigned int xlen = xdp->data_end - xdp->data;
- unsigned int frag_size = netvsc_xdp_fraglen(hdroom + xlen);
+ unsigned int frag_size = xdp->frame_sz;
skb = build_skb(xbuf, frag_size);
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 44889eba1dbc..c54f967e2c66 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1671,6 +1671,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
xdp.rxq = &tfile->xdp_rxq;
+ xdp.frame_sz = buflen;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
if (act == XDP_REDIRECT || act == XDP_TX) {
@@ -2411,6 +2412,7 @@ static int tun_xdp_one(struct tun_struct *tun,
}
xdp_set_data_meta_invalid(xdp);
xdp->rxq = &tfile->xdp_rxq;
+ xdp->frame_sz = buflen;
act = bpf_prog_run_xdp(xdp_prog, xdp);
err = tun_xdp_act(tun, xdp_prog, xdp, act);
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index aece0e5eec8c..b586d2fa5551 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -405,10 +405,6 @@ static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
{
struct sk_buff *skb;
- if (!buflen) {
- buflen = SKB_DATA_ALIGN(headroom + len) +
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- }
skb = build_skb(head, buflen);
if (!skb)
return NULL;
@@ -564,13 +560,15 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
struct veth_stats *stats)
{
void *hard_start = frame->data - frame->headroom;
- void *head = hard_start - sizeof(struct xdp_frame);
int len = frame->len, delta = 0;
struct xdp_frame orig_frame;
struct bpf_prog *xdp_prog;
unsigned int headroom;
struct sk_buff *skb;
+ /* bpf_xdp_adjust_head() assures BPF cannot access xdp_frame area */
+ hard_start -= sizeof(struct xdp_frame);
+
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
if (likely(xdp_prog)) {
@@ -581,6 +579,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
xdp.data = frame->data;
xdp.data_end = frame->data + frame->len;
xdp.data_meta = frame->data - frame->metasize;
+ xdp.frame_sz = frame->frame_sz;
xdp.rxq = &rq->xdp_rxq;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
@@ -592,7 +591,6 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
break;
case XDP_TX:
orig_frame = *frame;
- xdp.data_hard_start = head;
xdp.rxq->mem = frame->mem;
if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
trace_xdp_exception(rq->dev, xdp_prog, act);
@@ -605,7 +603,6 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
goto xdp_xmit;
case XDP_REDIRECT:
orig_frame = *frame;
- xdp.data_hard_start = head;
xdp.rxq->mem = frame->mem;
if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
frame = &orig_frame;
@@ -629,7 +626,7 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
rcu_read_unlock();
headroom = sizeof(struct xdp_frame) + frame->headroom - delta;
- skb = veth_build_skb(head, headroom, len, 0);
+ skb = veth_build_skb(hard_start, headroom, len, frame->frame_sz);
if (!skb) {
xdp_return_frame(frame);
stats->rx_drops++;
@@ -695,9 +692,8 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
goto drop;
}
- nskb = veth_build_skb(head,
- VETH_XDP_HEADROOM + mac_len, skb->len,
- PAGE_SIZE);
+ nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len,
+ skb->len, PAGE_SIZE);
if (!nskb) {
page_frag_free(head);
goto drop;
@@ -715,6 +711,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
xdp.data_end = xdp.data + pktlen;
xdp.data_meta = xdp.data;
xdp.rxq = &rq->xdp_rxq;
+
+ /* SKB "head" area always has tailroom for skb_shared_info */
+ xdp.frame_sz = (void *)skb_end_pointer(skb) - xdp.data_hard_start;
+ xdp.frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
orig_data = xdp.data;
orig_data_end = xdp.data_end;
@@ -758,6 +759,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
}
rcu_read_unlock();
+ /* check if bpf_xdp_adjust_head was used */
delta = orig_data - xdp.data;
off = mac_len + delta;
if (off > 0)
@@ -765,9 +767,11 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
else if (off < 0)
__skb_pull(skb, -off);
skb->mac_header -= delta;
+
+ /* check if bpf_xdp_adjust_tail was used */
off = xdp.data_end - orig_data_end;
if (off != 0)
- __skb_put(skb, off);
+ __skb_put(skb, off); /* positive on grow, negative on shrink */
skb->protocol = eth_type_trans(skb, rq->dev);
metalen = xdp.data - xdp.data_meta;
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 11f722460513..9e1b5d748586 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -689,6 +689,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
xdp.data_end = xdp.data + len;
xdp.data_meta = xdp.data;
xdp.rxq = &rq->xdp_rxq;
+ xdp.frame_sz = buflen;
orig_data = xdp.data;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
stats->xdp_packets++;
@@ -797,10 +798,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
int offset = buf - page_address(page);
struct sk_buff *head_skb, *curr_skb;
struct bpf_prog *xdp_prog;
- unsigned int truesize;
+ unsigned int truesize = mergeable_ctx_to_truesize(ctx);
unsigned int headroom = mergeable_ctx_to_headroom(ctx);
- int err;
unsigned int metasize = 0;
+ unsigned int frame_sz;
+ int err;
head_skb = NULL;
stats->bytes += len - vi->hdr_len;
@@ -821,6 +823,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
if (unlikely(hdr->hdr.gso_type))
goto err_xdp;
+ /* Buffers with headroom use PAGE_SIZE as alloc size,
+ * see add_recvbuf_mergeable() + get_mergeable_buf_len()
+ */
+ frame_sz = headroom ? PAGE_SIZE : truesize;
+
/* This happens when rx buffer size is underestimated
* or headroom is not enough because of the buffer
* was refilled before XDP is set. This should only
@@ -834,6 +841,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
page, offset,
VIRTIO_XDP_HEADROOM,
&len);
+ frame_sz = PAGE_SIZE;
+
if (!xdp_page)
goto err_xdp;
offset = VIRTIO_XDP_HEADROOM;
@@ -850,6 +859,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
xdp.data_end = xdp.data + (len - vi->hdr_len);
xdp.data_meta = xdp.data;
xdp.rxq = &rq->xdp_rxq;
+ xdp.frame_sz = frame_sz - vi->hdr_len;
act = bpf_prog_run_xdp(xdp_prog, &xdp);
stats->xdp_packets++;
@@ -924,7 +934,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
}
rcu_read_unlock();
- truesize = mergeable_ctx_to_truesize(ctx);
if (unlikely(len > truesize)) {
pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
dev->name, len, (unsigned long)ctx);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2927f02cc7e1..516519dcc8ff 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -747,6 +747,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
xdp->data = buf + pad;
xdp->data_end = xdp->data + len;
hdr->buflen = buflen;
+ xdp->frame_sz = buflen;
--net->refcnt_bias;
alloc_frag->offset += buflen;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c45d198ac38c..efe8836b5c48 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -19,6 +19,7 @@
#include <linux/mutex.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
+#include <linux/capability.h>
struct bpf_verifier_env;
struct bpf_verifier_log;
@@ -119,7 +120,7 @@ struct bpf_map {
struct bpf_map_memory memory;
char name[BPF_OBJ_NAME_LEN];
u32 btf_vmlinux_value_type_id;
- bool unpriv_array;
+ bool bypass_spec_v1;
bool frozen; /* write-once; write-protected by freeze_mutex */
/* 22 bytes hole */
@@ -1095,6 +1096,21 @@ struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
extern int sysctl_unprivileged_bpf_disabled;
+static inline bool bpf_allow_ptr_leaks(void)
+{
+ return perfmon_capable();
+}
+
+static inline bool bpf_bypass_spec_v1(void)
+{
+ return perfmon_capable();
+}
+
+static inline bool bpf_bypass_spec_v4(void)
+{
+ return perfmon_capable();
+}
+
int bpf_map_new_fd(struct bpf_map *map, int flags);
int bpf_prog_new_fd(struct bpf_prog *prog);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 6abd5a778fcd..ea833087e853 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -375,6 +375,9 @@ struct bpf_verifier_env {
u32 used_map_cnt; /* number of used maps */
u32 id_gen; /* used to generate unique reg IDs */
bool allow_ptr_leaks;
+ bool bpf_capable;
+ bool bypass_spec_v1;
+ bool bypass_spec_v4;
bool seen_direct_write;
struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
const struct bpf_line_info *prev_linfo;
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 027d7e4a853b..b4345b38a6be 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -256,6 +256,11 @@ static inline bool perfmon_capable(void)
return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN);
}
+static inline bool bpf_capable(void)
+{
+ return capable(CAP_BPF) || capable(CAP_SYS_ADMIN);
+}
+
/* audit system wants to get cap info from files as well */
extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 3cc6d5d84aa4..3094fccf5a88 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -6,6 +6,8 @@
#ifndef __LINUX_NET_XDP_H__
#define __LINUX_NET_XDP_H__
+#include <linux/skbuff.h> /* skb_shared_info */
+
/**
* DOC: XDP RX-queue information
*
@@ -70,13 +72,25 @@ struct xdp_buff {
void *data_hard_start;
unsigned long handle;
struct xdp_rxq_info *rxq;
+ u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
};
+/* Reserve memory area at the end of the data area.
+ *
+ * This macro reserves tailroom in the XDP buffer by limiting the
+ * XDP/BPF data access to data_hard_end. Notice same area (and size)
+ * is used for XDP_PASS, when constructing the SKB via build_skb().
+ */
+#define xdp_data_hard_end(xdp) \
+ ((xdp)->data_hard_start + (xdp)->frame_sz - \
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
struct xdp_frame {
void *data;
u16 len;
u16 headroom;
- u16 metasize;
+ u32 metasize:8;
+ u32 frame_sz:24;
/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
* while mem info is valid on remote CPU.
*/
@@ -91,6 +105,10 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame)
frame->dev_rx = NULL;
}
+/* Avoids inlining WARN macro in fast-path */
+void xdp_warn(const char *msg, const char *func, const int line);
+#define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__)
+
struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp);
/* Convert xdp_buff to xdp_frame */
@@ -111,6 +129,12 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
if (unlikely((headroom - metasize) < sizeof(*xdp_frame)))
return NULL;
+ /* Catch if driver didn't reserve tailroom for skb_shared_info */
+ if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) {
+ XDP_WARN("Driver BUG: missing reserved tailroom");
+ return NULL;
+ }
+
/* Store info in top of packet */
xdp_frame = xdp->data_hard_start;
@@ -118,6 +142,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
xdp_frame->len = xdp->data_end - xdp->data;
xdp_frame->headroom = headroom - sizeof(*xdp_frame);
xdp_frame->metasize = metasize;
+ xdp_frame->frame_sz = xdp->frame_sz;
/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
xdp_frame->mem = xdp->rxq->mem;
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 67191ccaab85..abd72de25fa4 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -236,6 +236,12 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address,
else
return address + offset;
}
+
+static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem)
+{
+ return umem->chunk_size_nohr + umem->headroom;
+}
+
#else
static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
@@ -366,6 +372,11 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle,
return 0;
}
+static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem)
+{
+ return 0;
+}
+
static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
return -EOPNOTSUPP;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 32cbf36c7729..b9b8a0f63b91 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2015,8 +2015,8 @@ union bpf_attr {
* int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
* Description
* Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
- * only possible to shrink the packet as of this writing,
- * therefore *delta* must be a negative integer.
+ * possible to both shrink and grow the packet tail.
+ * Shrinking is done by passing a negative integer as *delta*.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index e58c9636741b..c7372180a0a9 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -274,6 +274,7 @@ struct vfs_ns_cap_data {
arbitrary SCSI commands */
/* Allow setting encryption key on loopback filesystem */
/* Allow setting zone reclaim policy */
+/* Allow everything under CAP_BPF and CAP_PERFMON for backward compatibility */
#define CAP_SYS_ADMIN 21
@@ -374,7 +375,38 @@ struct vfs_ns_cap_data {
#define CAP_PERFMON 38
-#define CAP_LAST_CAP CAP_PERFMON
+/*
+ * CAP_BPF allows the following BPF operations:
+ * - Creating all types of BPF maps
+ * - Advanced verifier features
+ * - Indirect variable access
+ * - Bounded loops
+ * - BPF to BPF function calls
+ * - Scalar precision tracking
+ * - Larger complexity limits
+ * - Dead code elimination
+ * - And potentially other features
+ * - Loading BPF Type Format (BTF) data
+ * - Retrieve xlated and JITed code of BPF programs
+ * - Use bpf_spin_lock() helper
+ *
+ * CAP_PERFMON relaxes the verifier checks further:
+ * - BPF progs can use pointer-to-integer conversions
+ * - speculation attack hardening measures are bypassed
+ * - bpf_probe_read to read arbitrary kernel memory is allowed
+ * - bpf_trace_printk to print kernel memory is allowed
+ *
+ * CAP_SYS_ADMIN is required to use bpf_probe_write_user.
+ *
+ * CAP_SYS_ADMIN is required to iterate system wide loaded
+ * programs, maps, links, BTFs and convert their IDs to file descriptors.
+ *
+ * CAP_PERFMON and CAP_BPF are required to load tracing programs.
+ * CAP_NET_ADMIN and CAP_BPF are required to load networking programs.
+ */
+#define CAP_BPF 39
+
+#define CAP_LAST_CAP CAP_BPF
#define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 95d77770353c..1d5bb0d983b2 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int ret, numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
- bool unpriv = !capable(CAP_SYS_ADMIN);
+ bool bypass_spec_v1 = bpf_bypass_spec_v1();
u64 cost, array_size, mask64;
struct bpf_map_memory mem;
struct bpf_array *array;
@@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
mask64 -= 1;
index_mask = mask64;
- if (unpriv) {
+ if (!bypass_spec_v1) {
/* round up array size to nearest power of 2,
* since cpu will speculate within index_mask limits
*/
@@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
}
array->index_mask = index_mask;
- array->map.unpriv_array = unpriv;
+ array->map.bypass_spec_v1 = bypass_spec_v1;
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
@@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- if (map->unpriv_array) {
+ if (!map->bypass_spec_v1) {
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
} else {
@@ -1053,7 +1053,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- if (map->unpriv_array) {
+ if (!map->bypass_spec_v1) {
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
} else {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 26cb51f2db72..c6b0decaa46a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
struct bpf_map *map;
int err;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 6aa11de67315..c40ff4cf9880 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
- !capable(CAP_SYS_ADMIN))
+ !bpf_capable())
return;
bpf_prog_ksym_set_addr(fp);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 3fe0b006d2d2..8b85bfddfac7 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -85,7 +85,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
u64 cost;
int ret;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
/* check sanity of attributes */
@@ -162,25 +162,10 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
/* Part of headroom was reserved to xdpf */
hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
- /* build_skb need to place skb_shared_info after SKB end, and
- * also want to know the memory "truesize". Thus, need to
- * know the memory frame size backing xdp_buff.
- *
- * XDP was designed to have PAGE_SIZE frames, but this
- * assumption is not longer true with ixgbe and i40e. It
- * would be preferred to set frame_size to 2048 or 4096
- * depending on the driver.
- * frame_size = 2048;
- * frame_len = frame_size - sizeof(*xdp_frame);
- *
- * Instead, with info avail, skb_shared_info in placed after
- * packet len. This, unfortunately fakes the truesize.
- * Another disadvantage of this approach, the skb_shared_info
- * is not at a fixed memory location, with mixed length
- * packets, which is bad for cache-line hotness.
+ /* Memory size backing xdp_frame data already has reserved
+ * room for build_skb to place skb_shared_info in tailroom.
*/
- frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) +
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ frame_size = xdpf->frame_sz;
pkt_data_start = xdpf->data - hard_start_headroom;
skb = build_skb_around(skb, pkt_data_start, frame_size);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d541c8486c95..b4b288a3c3c9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
- if (lru && !capable(CAP_SYS_ADMIN))
+ if (lru && !bpf_capable())
/* LRU implementation is much complicated than other
- * maps. Hence, limit to CAP_SYS_ADMIN for now.
+ * maps. Hence, limit to CAP_BPF.
*/
return -EPERM;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 5c0290e0696e..886949fdcece 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -633,7 +633,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
break;
}
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return NULL;
switch (func_id) {
@@ -642,6 +642,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_spin_unlock:
return &bpf_spin_unlock_proto;
case BPF_FUNC_trace_printk:
+ if (!perfmon_capable())
+ return NULL;
return bpf_get_trace_printk_proto();
case BPF_FUNC_jiffies64:
return &bpf_jiffies64_proto;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 65c236cf341e..c8cc4e4cf98d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
u64 cost = sizeof(*trie), cost_per_node;
int ret;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
/* check sanity of attributes */
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index b3c48d1533cb..17738c93bec8 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
if (inner_map->ops == &array_map_ops) {
- inner_map_meta->unpriv_array = inner_map->unpriv_array;
+ inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
container_of(inner_map_meta, struct bpf_array, map)->index_mask =
container_of(inner_map, struct bpf_array, map)->index_mask;
}
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 30e1373fd437..05c8e043b9d2 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
/* Called from syscall */
static int queue_stack_map_alloc_check(union bpf_attr *attr)
{
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return -EPERM;
/* check sanity of attributes */
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 01badd3eda7a..21cde24386db 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
struct bpf_map_memory mem;
u64 array_size;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
array_size = sizeof(*array);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index db76339fe358..7b8381ce40a0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -93,7 +93,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
u64 cost, n_buckets;
int err;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index de2a75500233..79bcd8d056d2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1534,7 +1534,7 @@ static int map_freeze(const union bpf_attr *attr)
err = -EBUSY;
goto err_put;
}
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!bpf_capable()) {
err = -EPERM;
goto err_put;
}
@@ -2009,6 +2009,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
}
}
+static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ case BPF_PROG_TYPE_SK_SKB:
+ case BPF_PROG_TYPE_SK_MSG:
+ case BPF_PROG_TYPE_LIRC_MODE2:
+ case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_EXT: /* extends any prog */
+ return true;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ /* always unpriv */
+ case BPF_PROG_TYPE_SK_REUSEPORT:
+ /* equivalent to SOCKET_FILTER. need CAP_BPF only */
+ default:
+ return false;
+ }
+}
+
+static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
+ case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_LSM:
+ case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */
+ case BPF_PROG_TYPE_EXT: /* extends any prog */
+ return true;
+ default:
+ return false;
+ }
+}
+
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
@@ -2031,7 +2080,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
(attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
- !capable(CAP_SYS_ADMIN))
+ !bpf_capable())
return -EPERM;
/* copy eBPF program license from user space */
@@ -2044,11 +2093,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
is_gpl = license_is_gpl_compatible(license);
if (attr->insn_cnt == 0 ||
- attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
+ attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
return -E2BIG;
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
- !capable(CAP_SYS_ADMIN))
+ !bpf_capable())
+ return -EPERM;
+
+ if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (is_perfmon_prog_type(type) && !perfmon_capable())
return -EPERM;
bpf_prog_load_fixup_attach_type(attr);
@@ -2682,6 +2736,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
case BPF_PROG_TYPE_CGROUP_SKB:
+ if (!capable(CAP_NET_ADMIN))
+ /* cg-skb progs can be loaded by unpriv user.
+ * check permissions at attach time.
+ */
+ return -EPERM;
return prog->enforce_expected_attach_type &&
prog->expected_attach_type != attach_type ?
-EINVAL : 0;
@@ -2747,9 +2806,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
struct bpf_prog *prog;
int ret;
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
@@ -2804,9 +2860,6 @@ static int bpf_prog_detach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
if (CHECK_ATTR(BPF_PROG_DETACH))
return -EINVAL;
@@ -2819,6 +2872,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_PROG_TYPE_LIRC_MODE2:
return lirc_prog_detach(attr);
case BPF_PROG_TYPE_FLOW_DISSECTOR:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
return skb_flow_dissector_bpf_prog_detach(attr);
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SKB:
@@ -2882,8 +2937,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
struct bpf_prog *prog;
int ret = -ENOTSUPP;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
if (CHECK_ATTR(BPF_PROG_TEST_RUN))
return -EINVAL;
@@ -3184,7 +3237,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info.run_time_ns = stats.nsecs;
info.run_cnt = stats.cnt;
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!bpf_capable()) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
info.nr_jited_ksyms = 0;
@@ -3543,7 +3596,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return -EPERM;
return btf_new_fd(attr);
@@ -3766,9 +3819,6 @@ static int link_create(union bpf_attr *attr)
struct bpf_prog *prog;
int ret;
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
@@ -3817,9 +3867,6 @@ static int link_update(union bpf_attr *attr)
u32 flags;
int ret;
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
if (CHECK_ATTR(BPF_LINK_UPDATE))
return -EINVAL;
@@ -3988,7 +4035,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
union bpf_attr attr;
int err;
- if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
+ if (sysctl_unprivileged_bpf_disabled && !bpf_capable())
return -EPERM;
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a3f2af756fd6..180933f6fba9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1295,7 +1295,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
reg->type = SCALAR_VALUE;
reg->var_off = tnum_unknown;
reg->frameno = 0;
- reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks;
+ reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
__mark_reg_unbounded(reg);
}
@@ -1427,8 +1427,9 @@ static int check_subprogs(struct bpf_verifier_env *env)
continue;
if (insn[i].src_reg != BPF_PSEUDO_CALL)
continue;
- if (!env->allow_ptr_leaks) {
- verbose(env, "function calls to other bpf functions are allowed for root only\n");
+ if (!env->bpf_capable) {
+ verbose(env,
+ "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
return -EPERM;
}
ret = add_subprog(env, i + insn[i].imm + 1);
@@ -1962,8 +1963,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
bool new_marks = false;
int i, err;
- if (!env->allow_ptr_leaks)
- /* backtracking is root only for now */
+ if (!env->bpf_capable)
return 0;
func = st->frame[st->curframe];
@@ -2211,7 +2211,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
reg = &cur->regs[value_regno];
if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
- !register_is_null(reg) && env->allow_ptr_leaks) {
+ !register_is_null(reg) && env->bpf_capable) {
if (dst_reg != BPF_REG_FP) {
/* The backtracking logic can only recognize explicit
* stack slot address like [fp - 8]. Other spill of
@@ -2237,7 +2237,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
return -EINVAL;
}
- if (!env->allow_ptr_leaks) {
+ if (!env->bypass_spec_v4) {
bool sanitize = false;
if (state->stack[spi].slot_type[0] == STACK_SPILL &&
@@ -3432,7 +3432,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
* Spectre masking for stack ALU.
* See also retrieve_ptr_limit().
*/
- if (!env->allow_ptr_leaks) {
+ if (!env->bypass_spec_v1) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
@@ -4435,10 +4435,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
if (!BPF_MAP_PTR(aux->map_ptr_state))
bpf_map_ptr_store(aux, meta->map_ptr,
- meta->map_ptr->unpriv_array);
+ !meta->map_ptr->bypass_spec_v1);
else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
- meta->map_ptr->unpriv_array);
+ !meta->map_ptr->bypass_spec_v1);
return 0;
}
@@ -4807,7 +4807,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
const struct bpf_insn *insn)
{
- return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K;
+ return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K;
}
static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
@@ -5117,7 +5117,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* For unprivileged we require that resulting offset must be in bounds
* in order to be able to sanitize access later on.
*/
- if (!env->allow_ptr_leaks) {
+ if (!env->bypass_spec_v1) {
if (dst_reg->type == PTR_TO_MAP_VALUE &&
check_map_access(env, dst, dst_reg->off, 1, false)) {
verbose(env, "R%d pointer arithmetic of map value goes out of range, "
@@ -7244,7 +7244,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
insn_stack[env->cfg.cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- if (loop_ok && env->allow_ptr_leaks)
+ if (loop_ok && env->bpf_capable)
return 0;
verbose_linfo(env, t, "%d: ", t);
verbose_linfo(env, w, "%d: ", w);
@@ -8353,7 +8353,7 @@ next:
if (env->max_states_per_insn < states_cnt)
env->max_states_per_insn = states_cnt;
- if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
+ if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
return push_jmp_history(env, cur);
if (!add_new_state)
@@ -10014,7 +10014,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->code = BPF_JMP | BPF_TAIL_CALL;
aux = &env->insn_aux_data[i + delta];
- if (env->allow_ptr_leaks && !expect_blinding &&
+ if (env->bpf_capable && !expect_blinding &&
prog->jit_requested &&
!bpf_map_key_poisoned(aux) &&
!bpf_map_ptr_poisoned(aux) &&
@@ -10758,7 +10758,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
- is_priv = capable(CAP_SYS_ADMIN);
+ is_priv = bpf_capable();
if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
mutex_lock(&bpf_verifier_lock);
@@ -10799,7 +10799,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
env->strict_alignment = false;
- env->allow_ptr_leaks = is_priv;
+ env->allow_ptr_leaks = bpf_allow_ptr_leaks();
+ env->bypass_spec_v1 = bpf_bypass_spec_v1();
+ env->bypass_spec_v4 = bpf_bypass_spec_v4();
+ env->bpf_capable = bpf_capable();
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d961428fb5b6..9a84d7fb4869 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -315,6 +315,9 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
{
+ if (!capable(CAP_SYS_ADMIN))
+ return NULL;
+
pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
current->comm, task_pid_nr(current));
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 29dbdd4c29f6..30ba7d38941d 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -470,25 +470,34 @@ out:
int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
union bpf_attr __user *uattr)
{
+ u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ u32 headroom = XDP_PACKET_HEADROOM;
u32 size = kattr->test.data_size_in;
u32 repeat = kattr->test.repeat;
struct netdev_rx_queue *rxqueue;
struct xdp_buff xdp = {};
u32 retval, duration;
+ u32 max_data_sz;
void *data;
int ret;
if (kattr->test.ctx_in || kattr->test.ctx_out)
return -EINVAL;
- data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0);
+ /* XDP has extra tailroom, as (most) drivers use a full page */
+ max_data_sz = 4096 - headroom - tailroom;
+ if (size > max_data_sz)
+ return -EINVAL;
+
+ data = bpf_test_init(kattr, max_data_sz, headroom, tailroom);
if (IS_ERR(data))
return PTR_ERR(data);
xdp.data_hard_start = data;
- xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN;
+ xdp.data = data + headroom;
xdp.data_meta = xdp.data;
xdp.data_end = xdp.data + size;
+ xdp.frame_sz = headroom + max_data_sz + tailroom;
rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0);
xdp.rxq = &rxqueue->xdp_rxq;
@@ -496,8 +505,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true);
if (ret)
goto out;
- if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN ||
- xdp.data_end != xdp.data + size)
+ if (xdp.data != data + headroom || xdp.data_end != xdp.data + size)
size = xdp.data_end - xdp.data;
ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
out:
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 756b63b6f7b3..d2c4d16dadba 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -625,7 +625,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
!attr->btf_key_type_id || !attr->btf_value_type_id)
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return -EPERM;
if (attr->value_size > MAX_VALUE_SIZE)
@@ -978,7 +978,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
/* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as
* the map_alloc_check() side also does.
*/
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
nla_for_each_nested(nla, nla_stgs, rem) {
diff --git a/net/core/dev.c b/net/core/dev.c
index 4c91de39890a..f937a3ff668d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4617,6 +4617,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
xdp->data_meta = xdp->data;
xdp->data_end = xdp->data + hlen;
xdp->data_hard_start = skb->data - skb_headroom(skb);
+
+ /* SKB "head" area always has tailroom for skb_shared_info */
+ xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
+ xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
orig_data_end = xdp->data_end;
orig_data = xdp->data;
eth = (struct ethhdr *)xdp->data;
@@ -4640,14 +4645,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
skb_reset_network_header(skb);
}
- /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
- * pckt.
- */
- off = orig_data_end - xdp->data_end;
+ /* check if bpf_xdp_adjust_tail was used */
+ off = xdp->data_end - orig_data_end;
if (off != 0) {
skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
- skb->len -= off;
-
+ skb->len += off; /* positive on grow, negative on shrink */
}
/* check if XDP changed eth hdr such SKB needs update */
diff --git a/net/core/filter.c b/net/core/filter.c
index 5815902bb617..f8a3c7e9d027 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3411,15 +3411,26 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
+ void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
void *data_end = xdp->data_end + offset;
- /* only shrinking is allowed for now. */
- if (unlikely(offset >= 0))
+ /* Notice that xdp_data_hard_end has reserved some tailroom */
+ if (unlikely(data_end > data_hard_end))
return -EINVAL;
+ /* ALL drivers MUST init xdp->frame_sz, chicken check below */
+ if (unlikely(xdp->frame_sz > PAGE_SIZE)) {
+ WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz);
+ return -EINVAL;
+ }
+
if (unlikely(data_end < xdp->data + ETH_HLEN))
return -EINVAL;
+ /* Clear memory area on grow, can contain uninit kernel memory */
+ if (offset > 0)
+ memset(xdp->data_end, 0, offset);
+
xdp->data_end = data_end;
return 0;
@@ -6676,7 +6687,7 @@ static bool cg_skb_is_valid_access(int off, int size,
return false;
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_end):
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return false;
break;
}
@@ -6688,7 +6699,7 @@ static bool cg_skb_is_valid_access(int off, int size,
case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
case bpf_ctx_range(struct __sk_buff, tstamp):
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return false;
break;
default:
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 4c7ea85486af..490b8f5fa8ee 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -11,6 +11,7 @@
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/rhashtable.h>
+#include <linux/bug.h>
#include <net/page_pool.h>
#include <net/xdp.h>
@@ -496,3 +497,10 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
return xdpf;
}
EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
+
+/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */
+void xdp_warn(const char *msg, const char *func, const int line)
+{
+ WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg);
+};
+EXPORT_SYMBOL_GPL(xdp_warn);
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index d233ab3f1533..98e1513b608a 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -27,9 +27,9 @@
"audit_control", "setfcap"
#define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \
- "wake_alarm", "block_suspend", "audit_read", "perfmon"
+ "wake_alarm", "block_suspend", "audit_read", "perfmon", "bpf"
-#if CAP_LAST_CAP > CAP_PERFMON
+#if CAP_LAST_CAP > CAP_BPF
#error New capability defined, please update COMMON_CAP2_PERMS.
#endif
diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index f54347f55ee0..1b73e63274b5 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -80,13 +80,12 @@ print_bool_feature(const char *feat_name, const char *plain_name,
printf("%s is %savailable\n", plain_name, res ? "" : "NOT ");
}
-static void print_kernel_option(const char *name, const char *value)
+static void print_kernel_option(const char *name, const char *value,
+ const char *define_prefix)
{
char *endptr;
int res;
- /* No support for C-style ouptut */
-
if (json_output) {
if (!value) {
jsonw_null_field(json_wtr, name);
@@ -98,6 +97,12 @@ static void print_kernel_option(const char *name, const char *value)
jsonw_int_field(json_wtr, name, res);
else
jsonw_string_field(json_wtr, name, value);
+ } else if (define_prefix) {
+ if (value)
+ printf("#define %s%s %s\n", define_prefix,
+ name, value);
+ else
+ printf("/* %s%s is not set */\n", define_prefix, name);
} else {
if (value)
printf("%s is set to %s\n", name, value);
@@ -315,77 +320,84 @@ static bool read_next_kernel_config_option(gzFile file, char *buf, size_t n,
return false;
}
-static void probe_kernel_image_config(void)
+static void probe_kernel_image_config(const char *define_prefix)
{
- static const char * const options[] = {
+ static const struct {
+ const char * const name;
+ bool macro_dump;
+ } options[] = {
/* Enable BPF */
- "CONFIG_BPF",
+ { "CONFIG_BPF", },
/* Enable bpf() syscall */
- "CONFIG_BPF_SYSCALL",
+ { "CONFIG_BPF_SYSCALL", },
/* Does selected architecture support eBPF JIT compiler */
- "CONFIG_HAVE_EBPF_JIT",
+ { "CONFIG_HAVE_EBPF_JIT", },
/* Compile eBPF JIT compiler */
- "CONFIG_BPF_JIT",
+ { "CONFIG_BPF_JIT", },
/* Avoid compiling eBPF interpreter (use JIT only) */
- "CONFIG_BPF_JIT_ALWAYS_ON",
+ { "CONFIG_BPF_JIT_ALWAYS_ON", },
/* cgroups */
- "CONFIG_CGROUPS",
+ { "CONFIG_CGROUPS", },
/* BPF programs attached to cgroups */
- "CONFIG_CGROUP_BPF",
+ { "CONFIG_CGROUP_BPF", },
/* bpf_get_cgroup_classid() helper */
- "CONFIG_CGROUP_NET_CLASSID",
+ { "CONFIG_CGROUP_NET_CLASSID", },
/* bpf_skb_{,ancestor_}cgroup_id() helpers */
- "CONFIG_SOCK_CGROUP_DATA",
+ { "CONFIG_SOCK_CGROUP_DATA", },
/* Tracing: attach BPF to kprobes, tracepoints, etc. */
- "CONFIG_BPF_EVENTS",
+ { "CONFIG_BPF_EVENTS", },
/* Kprobes */
- "CONFIG_KPROBE_EVENTS",
+ { "CONFIG_KPROBE_EVENTS", },
/* Uprobes */
- "CONFIG_UPROBE_EVENTS",
+ { "CONFIG_UPROBE_EVENTS", },
/* Tracepoints */
- "CONFIG_TRACING",
+ { "CONFIG_TRACING", },
/* Syscall tracepoints */
- "CONFIG_FTRACE_SYSCALLS",
+ { "CONFIG_FTRACE_SYSCALLS", },
/* bpf_override_return() helper support for selected arch */
- "CONFIG_FUNCTION_ERROR_INJECTION",
+ { "CONFIG_FUNCTION_ERROR_INJECTION", },
/* bpf_override_return() helper */
- "CONFIG_BPF_KPROBE_OVERRIDE",
+ { "CONFIG_BPF_KPROBE_OVERRIDE", },
/* Network */
- "CONFIG_NET",
+ { "CONFIG_NET", },
/* AF_XDP sockets */
- "CONFIG_XDP_SOCKETS",
+ { "CONFIG_XDP_SOCKETS", },
/* BPF_PROG_TYPE_LWT_* and related helpers */
- "CONFIG_LWTUNNEL_BPF",
+ { "CONFIG_LWTUNNEL_BPF", },
/* BPF_PROG_TYPE_SCHED_ACT, TC (traffic control) actions */
- "CONFIG_NET_ACT_BPF",
+ { "CONFIG_NET_ACT_BPF", },
/* BPF_PROG_TYPE_SCHED_CLS, TC filters */
- "CONFIG_NET_CLS_BPF",
+ { "CONFIG_NET_CLS_BPF", },
/* TC clsact qdisc */
- "CONFIG_NET_CLS_ACT",
+ { "CONFIG_NET_CLS_ACT", },
/* Ingress filtering with TC */
- "CONFIG_NET_SCH_INGRESS",
+ { "CONFIG_NET_SCH_INGRESS", },
/* bpf_skb_get_xfrm_state() helper */
- "CONFIG_XFRM",
+ { "CONFIG_XFRM", },
/* bpf_get_route_realm() helper */
- "CONFIG_IP_ROUTE_CLASSID",
+ { "CONFIG_IP_ROUTE_CLASSID", },
/* BPF_PROG_TYPE_LWT_SEG6_LOCAL and related helpers */
- "CONFIG_IPV6_SEG6_BPF",
+ { "CONFIG_IPV6_SEG6_BPF", },
/* BPF_PROG_TYPE_LIRC_MODE2 and related helpers */
- "CONFIG_BPF_LIRC_MODE2",
+ { "CONFIG_BPF_LIRC_MODE2", },
/* BPF stream parser and BPF socket maps */
- "CONFIG_BPF_STREAM_PARSER",
+ { "CONFIG_BPF_STREAM_PARSER", },
/* xt_bpf module for passing BPF programs to netfilter */
- "CONFIG_NETFILTER_XT_MATCH_BPF",
+ { "CONFIG_NETFILTER_XT_MATCH_BPF", },
/* bpfilter back-end for iptables */
- "CONFIG_BPFILTER",
+ { "CONFIG_BPFILTER", },
/* bpftilter module with "user mode helper" */
- "CONFIG_BPFILTER_UMH",
+ { "CONFIG_BPFILTER_UMH", },
/* test_bpf module for BPF tests */
- "CONFIG_TEST_BPF",
+ { "CONFIG_TEST_BPF", },
+
+ /* Misc configs useful in BPF C programs */
+ /* jiffies <-> sec conversion for bpf_jiffies64() helper */
+ { "CONFIG_HZ", true, }
};
char *values[ARRAY_SIZE(options)] = { };
struct utsname utsn;
@@ -427,7 +439,8 @@ static void probe_kernel_image_config(void)
while (read_next_kernel_config_option(file, buf, sizeof(buf), &value)) {
for (i = 0; i < ARRAY_SIZE(options); i++) {
- if (values[i] || strcmp(buf, options[i]))
+ if ((define_prefix && !options[i].macro_dump) ||
+ values[i] || strcmp(buf, options[i].name))
continue;
values[i] = strdup(value);
@@ -439,7 +452,9 @@ end_parse:
gzclose(file);
for (i = 0; i < ARRAY_SIZE(options); i++) {
- print_kernel_option(options[i], values[i]);
+ if (define_prefix && !options[i].macro_dump)
+ continue;
+ print_kernel_option(options[i].name, values[i], define_prefix);
free(values[i]);
}
}
@@ -632,23 +647,22 @@ section_system_config(enum probe_component target, const char *define_prefix)
switch (target) {
case COMPONENT_KERNEL:
case COMPONENT_UNSPEC:
- if (define_prefix)
- break;
-
print_start_section("system_config",
"Scanning system configuration...",
- NULL, /* define_comment never used here */
- NULL); /* define_prefix always NULL here */
- if (check_procfs()) {
- probe_unprivileged_disabled();
- probe_jit_enable();
- probe_jit_harden();
- probe_jit_kallsyms();
- probe_jit_limit();
- } else {
- p_info("/* procfs not mounted, skipping related probes */");
+ "/*** Misc kernel config items ***/",
+ define_prefix);
+ if (!define_prefix) {
+ if (check_procfs()) {
+ probe_unprivileged_disabled();
+ probe_jit_enable();
+ probe_jit_harden();
+ probe_jit_kallsyms();
+ probe_jit_limit();
+ } else {
+ p_info("/* procfs not mounted, skipping related probes */");
+ }
}
- probe_kernel_image_config();
+ probe_kernel_image_config(define_prefix);
print_end_section();
break;
default:
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
index 6c8ca1c93f9b..d5c98f2cb12f 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
@@ -2,13 +2,13 @@
#include <test_progs.h>
#include <network_helpers.h>
-void test_xdp_adjust_tail(void)
+void test_xdp_adjust_tail_shrink(void)
{
- const char *file = "./test_adjust_tail.o";
+ const char *file = "./test_xdp_adjust_tail_shrink.o";
+ __u32 duration, retval, size, expect_sz;
struct bpf_object *obj;
- char buf[128];
- __u32 duration, retval, size;
int err, prog_fd;
+ char buf[128];
err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
if (CHECK_FAIL(err))
@@ -21,10 +21,121 @@ void test_xdp_adjust_tail(void)
"ipv4", "err %d errno %d retval %d size %d\n",
err, errno, retval, size);
+ expect_sz = sizeof(pkt_v6) - 20; /* Test shrink with 20 bytes */
err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
buf, &size, &retval, &duration);
- CHECK(err || retval != XDP_TX || size != 54,
- "ipv6", "err %d errno %d retval %d size %d\n",
+ CHECK(err || retval != XDP_TX || size != expect_sz,
+ "ipv6", "err %d errno %d retval %d size %d expect-size %d\n",
+ err, errno, retval, size, expect_sz);
+ bpf_object__close(obj);
+}
+
+void test_xdp_adjust_tail_grow(void)
+{
+ const char *file = "./test_xdp_adjust_tail_grow.o";
+ struct bpf_object *obj;
+ char buf[4096]; /* avoid segfault: large buf to hold grow results */
+ __u32 duration, retval, size, expect_sz;
+ int err, prog_fd;
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+ buf, &size, &retval, &duration);
+ CHECK(err || retval != XDP_DROP,
+ "ipv4", "err %d errno %d retval %d size %d\n",
err, errno, retval, size);
+
+ expect_sz = sizeof(pkt_v6) + 40; /* Test grow with 40 bytes */
+ err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6) /* 74 */,
+ buf, &size, &retval, &duration);
+ CHECK(err || retval != XDP_TX || size != expect_sz,
+ "ipv6", "err %d errno %d retval %d size %d expect-size %d\n",
+ err, errno, retval, size, expect_sz);
+
+ bpf_object__close(obj);
+}
+
+void test_xdp_adjust_tail_grow2(void)
+{
+ const char *file = "./test_xdp_adjust_tail_grow.o";
+ char buf[4096]; /* avoid segfault: large buf to hold grow results */
+ int tailroom = 320; /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info))*/;
+ struct bpf_object *obj;
+ int err, cnt, i;
+ int max_grow;
+
+ struct bpf_prog_test_run_attr tattr = {
+ .repeat = 1,
+ .data_in = &buf,
+ .data_out = &buf,
+ .data_size_in = 0, /* Per test */
+ .data_size_out = 0, /* Per test */
+ };
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &tattr.prog_fd);
+ if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno))
+ return;
+
+ /* Test case-64 */
+ memset(buf, 1, sizeof(buf));
+ tattr.data_size_in = 64; /* Determine test case via pkt size */
+ tattr.data_size_out = 128; /* Limit copy_size */
+ /* Kernel side allocates a packet memory area that is zero-initialized */
+ err = bpf_prog_test_run_xattr(&tattr);
+
+ CHECK_ATTR(errno != ENOSPC /* Due limit copy_size in bpf_test_finish */
+ || tattr.retval != XDP_TX
+ || tattr.data_size_out != 192, /* Expected grow size */
+ "case-64",
+ "err %d errno %d retval %d size %d\n",
+ err, errno, tattr.retval, tattr.data_size_out);
+
+ /* Extra checks for data contents */
+ CHECK_ATTR(tattr.data_size_out != 192
+ || buf[0] != 1 || buf[63] != 1 /* 0-63 memset to 1 */
+ || buf[64] != 0 || buf[127] != 0 /* 64-127 memset to 0 */
+ || buf[128] != 1 || buf[191] != 1, /*128-191 memset to 1 */
+ "case-64-data",
+ "err %d errno %d retval %d size %d\n",
+ err, errno, tattr.retval, tattr.data_size_out);
+
+ /* Test case-128 */
+ memset(buf, 2, sizeof(buf));
+ tattr.data_size_in = 128; /* Determine test case via pkt size */
+ tattr.data_size_out = sizeof(buf); /* Copy everything */
+ err = bpf_prog_test_run_xattr(&tattr);
+
+ max_grow = 4096 - XDP_PACKET_HEADROOM - tailroom; /* 3520 */
+ CHECK_ATTR(err
+ || tattr.retval != XDP_TX
+ || tattr.data_size_out != max_grow,/* Expect max grow size */
+ "case-128",
+ "err %d errno %d retval %d size %d expect-size %d\n",
+ err, errno, tattr.retval, tattr.data_size_out, max_grow);
+
+ /* Extra checks for data content: Count grow size, will contain zeros */
+ for (i = 0, cnt = 0; i < sizeof(buf); i++) {
+ if (buf[i] == 0)
+ cnt++;
+ }
+ CHECK_ATTR((cnt != (max_grow - tattr.data_size_in)) /* Grow increase */
+ || tattr.data_size_out != max_grow, /* Total grow size */
+ "case-128-data",
+ "err %d errno %d retval %d size %d grow-size %d\n",
+ err, errno, tattr.retval, tattr.data_size_out, cnt);
+
bpf_object__close(obj);
}
+
+void test_xdp_adjust_tail(void)
+{
+ if (test__start_subtest("xdp_adjust_tail_shrink"))
+ test_xdp_adjust_tail_shrink();
+ if (test__start_subtest("xdp_adjust_tail_grow"))
+ test_xdp_adjust_tail_grow();
+ if (test__start_subtest("xdp_adjust_tail_grow2"))
+ test_xdp_adjust_tail_grow2();
+}
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c
new file mode 100644
index 000000000000..3d66599eee2e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+SEC("xdp_adjust_tail_grow")
+int _xdp_adjust_tail_grow(struct xdp_md *xdp)
+{
+ void *data_end = (void *)(long)xdp->data_end;
+ void *data = (void *)(long)xdp->data;
+ unsigned int data_len;
+ int offset = 0;
+
+ /* Data length determines the test case */
+ data_len = data_end - data;
+
+ if (data_len == 54) { /* sizeof(pkt_v4) */
+ offset = 4096; /* test too large offset */
+ } else if (data_len == 74) { /* sizeof(pkt_v6) */
+ offset = 40;
+ } else if (data_len == 64) {
+ offset = 128;
+ } else if (data_len == 128) {
+ offset = 4096 - 256 - 320 - data_len; /* Max tail grow 3520 */
+ } else {
+ return XDP_ABORTED; /* No matching test */
+ }
+
+ if (bpf_xdp_adjust_tail(xdp, offset))
+ return XDP_DROP;
+ return XDP_TX;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_adjust_tail.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c
index b7fc85769bdc..22065a9cfb25 100644
--- a/tools/testing/selftests/bpf/progs/test_adjust_tail.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c
@@ -1,5 +1,5 @@
-/* SPDX-License-Identifier: GPL-2.0
- * Copyright (c) 2018 Facebook
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -11,15 +11,15 @@
int _version SEC("version") = 1;
-SEC("xdp_adjust_tail")
-int _xdp_adjust_tail(struct xdp_md *xdp)
+SEC("xdp_adjust_tail_shrink")
+int _xdp_adjust_tail_shrink(struct xdp_md *xdp)
{
void *data_end = (void *)(long)xdp->data_end;
void *data = (void *)(long)xdp->data;
int offset = 0;
- if (data_end - data == 54)
- offset = 256;
+ if (data_end - data == 54) /* sizeof(pkt_v4) */
+ offset = 256; /* shrink too much */
else
offset = 20;
if (bpf_xdp_adjust_tail(xdp, 0 - offset))
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 21a1ce219c1c..78a6bae56ea6 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -818,10 +818,18 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
}
}
+struct libcap {
+ struct __user_cap_header_struct hdr;
+ struct __user_cap_data_struct data[2];
+};
+
static int set_admin(bool admin)
{
cap_t caps;
- const cap_value_t cap_val = CAP_SYS_ADMIN;
+ /* need CAP_BPF, CAP_NET_ADMIN, CAP_PERFMON to load progs */
+ const cap_value_t cap_net_admin = CAP_NET_ADMIN;
+ const cap_value_t cap_sys_admin = CAP_SYS_ADMIN;
+ struct libcap *cap;
int ret = -1;
caps = cap_get_proc();
@@ -829,11 +837,26 @@ static int set_admin(bool admin)
perror("cap_get_proc");
return -1;
}
- if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val,
+ cap = (struct libcap *)caps;
+ if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_sys_admin, CAP_CLEAR)) {
+ perror("cap_set_flag clear admin");
+ goto out;
+ }
+ if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_admin,
admin ? CAP_SET : CAP_CLEAR)) {
- perror("cap_set_flag");
+ perror("cap_set_flag set_or_clear net");
goto out;
}
+ /* libcap is likely old and simply ignores CAP_BPF and CAP_PERFMON,
+ * so update effective bits manually
+ */
+ if (admin) {
+ cap->data[1].effective |= 1 << (38 /* CAP_PERFMON */ - 32);
+ cap->data[1].effective |= 1 << (39 /* CAP_BPF */ - 32);
+ } else {
+ cap->data[1].effective &= ~(1 << (38 - 32));
+ cap->data[1].effective &= ~(1 << (39 - 32));
+ }
if (cap_set_proc(caps)) {
perror("cap_set_proc");
goto out;
@@ -1067,9 +1090,11 @@ fail_log:
static bool is_admin(void)
{
+ cap_flag_value_t net_priv = CAP_CLEAR;
+ bool perfmon_priv = false;
+ bool bpf_priv = false;
+ struct libcap *cap;
cap_t caps;
- cap_flag_value_t sysadmin = CAP_CLEAR;
- const cap_value_t cap_val = CAP_SYS_ADMIN;
#ifdef CAP_IS_SUPPORTED
if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) {
@@ -1082,11 +1107,14 @@ static bool is_admin(void)
perror("cap_get_proc");
return false;
}
- if (cap_get_flag(caps, cap_val, CAP_EFFECTIVE, &sysadmin))
- perror("cap_get_flag");
+ cap = (struct libcap *)caps;
+ bpf_priv = cap->data[1].effective & (1 << (39/* CAP_BPF */ - 32));
+ perfmon_priv = cap->data[1].effective & (1 << (38/* CAP_PERFMON */ - 32));
+ if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv))
+ perror("cap_get_flag NET");
if (cap_free(caps))
perror("cap_free");
- return (sysadmin == CAP_SET);
+ return bpf_priv && perfmon_priv && net_priv == CAP_SET;
}
static void get_unpriv_disabled()
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index 2d752c4f8d9d..7629a0cebb9b 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -19,7 +19,7 @@
BPF_MOV64_IMM(BPF_REG_0, 2),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for root only",
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = 1,
@@ -315,7 +315,7 @@
BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "allowed for root only",
+ .errstr_unpriv = "allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = POINTER_VALUE,
@@ -346,7 +346,7 @@
BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "allowed for root only",
+ .errstr_unpriv = "allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = TEST_DATA_LEN + TEST_DATA_LEN - ETH_HLEN - ETH_HLEN,
@@ -397,7 +397,7 @@
BPF_MOV64_IMM(BPF_REG_0, 1),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for root only",
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
.fixup_map_hash_48b = { 3 },
.result_unpriv = REJECT,
.result = ACCEPT,
@@ -1064,7 +1064,7 @@
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "allowed for root only",
+ .errstr_unpriv = "allowed for",
.result_unpriv = REJECT,
.errstr = "R0 !read_ok",
.result = REJECT,
@@ -1977,7 +1977,7 @@
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
- .errstr_unpriv = "function calls to other bpf functions are allowed for root only",
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
},
@@ -2003,7 +2003,7 @@
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
- .errstr_unpriv = "function calls to other bpf functions are allowed for root only",
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
.errstr = "!read_ok",
.result = REJECT,
},
@@ -2028,7 +2028,7 @@
BPF_EXIT_INSN(),
},
.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
- .errstr_unpriv = "function calls to other bpf functions are allowed for root only",
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
.errstr = "!read_ok",
.result = REJECT,
},
diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c
index 50a8a63be4ac..5cf361d8eb1c 100644
--- a/tools/testing/selftests/bpf/verifier/dead_code.c
+++ b/tools/testing/selftests/bpf/verifier/dead_code.c
@@ -85,7 +85,7 @@
BPF_MOV64_IMM(BPF_REG_0, 12),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for root only",
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = 7,
@@ -103,7 +103,7 @@
BPF_MOV64_IMM(BPF_REG_0, 12),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for root only",
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = 7,
@@ -121,7 +121,7 @@
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for root only",
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = 7,
@@ -137,7 +137,7 @@
BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for root only",
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = 2,
@@ -152,7 +152,7 @@
BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
BPF_EXIT_INSN(),
},
- .errstr_unpriv = "function calls to other bpf functions are allowed for root only",
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
.result_unpriv = REJECT,
.result = ACCEPT,
.retval = 2,