path: root/include
author	David S. Miller <davem@davemloft.net>	2018-04-17 10:50:30 -0400
committer	David S. Miller <davem@davemloft.net>	2018-04-17 11:17:58 -0400
commit	684009d4fdaf40f1e50b0589cff6e039e97058a4 (patch)
tree	1f162a8267f6647ca930ef202dfc9ad0b0245fae /include
parent	liquidio: Enhanced ethtool stats (diff)
parent	xdp: avoid leaking info stored in frame data on page reuse (diff)
Merge branch 'XDP-redirect-memory-return-API'
Jesper Dangaard Brouer says:

====================
XDP redirect memory return API

Submitted against net-next, as it contains NIC driver changes.

This patchset works towards supporting different XDP RX-ring memory
allocators, as this will be needed by the AF_XDP zero-copy mode.

The patchset uses mlx5 as the sample driver, which gets XDP_REDIRECT
RX-mode implemented, but not ndo_xdp_xmit (as this API is subject to
change throughout the patchset).

A new struct xdp_frame is introduced (modeled after the cpumap xdp_pkt),
and both ndo_xdp_xmit and the new xdp_return_frame end up using it.

Support for a driver-supplied allocator is implemented, and a
refurbished version of page_pool is the first return-allocator type
introduced. This will be an integration point for AF_XDP zero-copy.

The mlx5 driver evolves into using the page_pool, and sees a
performance increase (with ndo_xdp_xmit out the ixgbe driver) from
6 Mpps to 12 Mpps.

The patchset stops at 16 patches (one over the limit), but more API
changes are planned, specifically extending the ndo_xdp_xmit and
xdp_return_frame APIs to support bulking, as this will address some
known limits.

V2: Updated according to Tariq's feedback
V3: Updated based on feedback from Jason Wang and Alex Duyck
V4: Updated based on feedback from Tariq and Jason
V5: Fix SPDX license, add Tariq's reviews, improve patch desc for perf test
V6: Updated based on feedback from Eric Dumazet and Alex Duyck
V7: Adapt to i40e that got XDP_REDIRECT support in-between
V8: Updated based on feedback from the kbuild test robot, and adjust for
    mlx5 changes; page_pool is only compiled into the kernel when a
    driver's Kconfig 'select's the feature
V9: Remove some inline statements, let the compiler decide what to inline
    Fix return value in virtio_net driver
    Adjust for mlx5 changes in-between submissions
V10: Minor adjust for mlx5 requested by Tariq
     Resubmit against net-next
V11: Avoid leaking info stored in frame data on page reuse
====================

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
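As a rough sketch of the RX-ring setup this cover letter describes, a driver might register a per-RX-queue page_pool as its return allocator roughly as below. The my_rxring structure and function are invented for illustration; only the page_pool_*, xdp_rxq_info_* and MEM_TYPE_PAGE_POOL symbols come from the headers added in this merge, and ERR_PTR-style error returns from page_pool_create() are an assumption.

#include <linux/err.h>
#include <net/xdp.h>
#include <net/page_pool.h>

struct my_rxring {			/* placeholder for a driver's RX-ring state */
	struct xdp_rxq_info xdp_rxq;
	struct page_pool *page_pool;
	unsigned int size;
};

/* Hypothetical per-RX-queue setup; not taken from any driver in this series. */
static int my_rxring_xdp_setup(struct my_rxring *ring, struct net_device *netdev,
			       struct device *dev, u32 queue_index, int numa_node)
{
	struct page_pool_params pp_params = {
		.flags     = PP_FLAG_DMA_MAP,	/* let page_pool do DMA map/unmap */
		.order     = 0,			/* one frame per order-0 page */
		.pool_size = ring->size,
		.nid       = numa_node,
		.dev       = dev,
		.dma_dir   = DMA_FROM_DEVICE,
	};
	int err;

	ring->page_pool = page_pool_create(&pp_params);
	if (IS_ERR(ring->page_pool))	/* assumes ERR_PTR-style errors */
		return PTR_ERR(ring->page_pool);

	err = xdp_rxq_info_reg(&ring->xdp_rxq, netdev, queue_index);
	if (err)
		goto err_pool;

	/* Tell the XDP core which allocator owns frames from this RX-ring,
	 * so xdp_return_frame() can route pages back, even from remote CPUs.
	 */
	err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, MEM_TYPE_PAGE_POOL,
					 ring->page_pool);
	if (err)
		goto err_unreg;

	return 0;

err_unreg:
	xdp_rxq_info_unreg(&ring->xdp_rxq);
err_pool:
	page_pool_destroy(ring->page_pool);
	return err;
}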
Diffstat (limited to 'include')
-rw-r--r--	include/linux/filter.h		24
-rw-r--r--	include/linux/if_tun.h		4
-rw-r--r--	include/linux/netdevice.h	4
-rw-r--r--	include/net/page_pool.h		143
-rw-r--r--	include/net/xdp.h		83
5 files changed, 231 insertions(+), 27 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index fc4e8f91b03d..4da8b2308174 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -30,6 +30,7 @@ struct sock;
struct seccomp_data;
struct bpf_prog_aux;
struct xdp_rxq_info;
+struct xdp_buff;
/* ArgX, context and stack frame pointer register positions. Note,
* Arg1, Arg2, Arg3, etc are used as argument mappings of function
@@ -500,14 +501,6 @@ struct bpf_skb_data_end {
void *data_end;
};
-struct xdp_buff {
- void *data;
- void *data_end;
- void *data_meta;
- void *data_hard_start;
- struct xdp_rxq_info *rxq;
-};
-
struct sk_msg_buff {
void *data;
void *data_end;
@@ -772,21 +765,6 @@ int xdp_do_redirect(struct net_device *dev,
struct bpf_prog *prog);
void xdp_do_flush_map(void);
-/* Drivers not supporting XDP metadata can use this helper, which
- * rejects any room expansion for metadata as a result.
- */
-static __always_inline void
-xdp_set_data_meta_invalid(struct xdp_buff *xdp)
-{
- xdp->data_meta = xdp->data + 1;
-}
-
-static __always_inline bool
-xdp_data_meta_unsupported(const struct xdp_buff *xdp)
-{
- return unlikely(xdp->data_meta > xdp->data);
-}
-
void bpf_warn_invalid_xdp_action(u32 act);
struct sock *do_sk_redirect_map(struct sk_buff *skb);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index fd00170b494f..3d2996dc7d85 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -22,7 +22,7 @@
#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
struct socket *tun_get_socket(struct file *);
struct ptr_ring *tun_get_tx_ring(struct file *file);
-bool tun_is_xdp_buff(void *ptr);
+bool tun_is_xdp_frame(void *ptr);
void *tun_xdp_to_ptr(void *ptr);
void *tun_ptr_to_xdp(void *ptr);
void tun_ptr_free(void *ptr);
@@ -39,7 +39,7 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
{
return ERR_PTR(-EINVAL);
}
-static inline bool tun_is_xdp_buff(void *ptr)
+static inline bool tun_is_xdp_frame(void *ptr)
{
return false;
}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cf44503ea81a..14e0777ffcfb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1165,7 +1165,7 @@ struct dev_ifalias {
* This function is used to set or query state related to XDP on the
* netdevice and manage BPF offload. See definition of
* enum bpf_netdev_command for details.
- * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp);
+ * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
* This function is used to submit an XDP packet for transmit on a
* netdevice.
* void (*ndo_xdp_flush)(struct net_device *dev);
@@ -1356,7 +1356,7 @@ struct net_device_ops {
int (*ndo_bpf)(struct net_device *dev,
struct netdev_bpf *bpf);
int (*ndo_xdp_xmit)(struct net_device *dev,
- struct xdp_buff *xdp);
+ struct xdp_frame *xdp);
void (*ndo_xdp_flush)(struct net_device *dev);
};
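
A minimal, hedged sketch of what the new prototype implies on the driver side follows; the mydrv_* names are invented, and only ndo_xdp_xmit's signature, struct xdp_frame and xdp_return_frame() come from this merge.

#include <linux/netdevice.h>
#include <net/xdp.h>

/* Hypothetical TX-descriptor post; a real driver fills hardware descriptors
 * and stashes xdpf so the completion path can return the frame later.
 */
static bool mydrv_post_tx(struct net_device *dev, void *data, u16 len,
			  struct xdp_frame *xdpf);

static int mydrv_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
{
	/* xdpf lives in the headroom of the packet page itself, so its
	 * data/len/mem fields stay valid until the frame is returned.
	 */
	if (!mydrv_post_tx(dev, xdpf->data, xdpf->len, xdpf)) {
		/* No TX room: give the page back to whichever allocator
		 * the RX side registered (page_pool, order-0 page, ...).
		 */
		xdp_return_frame(xdpf);
		return -ENOSPC;
	}
	return 0;
}

/* TX completion (DMA done): the stashed frame is returned the same way. */
static void mydrv_xdp_tx_done(struct xdp_frame *xdpf)
{
	xdp_return_frame(xdpf);
}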
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
new file mode 100644
index 000000000000..c79087153148
--- /dev/null
+++ b/include/net/page_pool.h
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * page_pool.h
+ * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
+ * Copyright (C) 2016 Red Hat, Inc.
+ */
+
+/**
+ * DOC: page_pool allocator
+ *
+ * This page_pool allocator is optimized for the XDP mode that
+ * uses one-frame-per-page, but has fallbacks that act like the
+ * regular page allocator APIs.
+ *
+ * Basic use involves replacing alloc_pages() calls with the
+ * page_pool_alloc_pages() call. Drivers should likely use
+ * page_pool_dev_alloc_pages() in place of dev_alloc_pages().
+ *
+ * If page_pool handles DMA mapping (uses page->private), then the API
+ * user is responsible for invoking page_pool_put_page() once. In case
+ * of an elevated refcnt, the DMA state is released, assuming other
+ * users of the page will eventually call put_page().
+ *
+ * If no DMA mapping is done, then it can act as a shim layer that
+ * falls through to alloc_page(). As no state is kept on the page, the
+ * regular put_page() call is sufficient.
+ */
+#ifndef _NET_PAGE_POOL_H
+#define _NET_PAGE_POOL_H
+
+#include <linux/mm.h> /* Needed by ptr_ring */
+#include <linux/ptr_ring.h>
+#include <linux/dma-direction.h>
+
+#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */
+#define PP_FLAG_ALL PP_FLAG_DMA_MAP
+
+/*
+ * Fast allocation side cache array/stack
+ *
+ * The cache size and refill watermark are related to the network
+ * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX
+ * ring is usually refilled and the max consumed elements will be 64,
+ * thus a natural max size of objects needed in the cache.
+ *
+ * Keeping room for more objects is due to the XDP_DROP use-case:
+ * XDP_DROP allows recycling objects directly into this array, as it
+ * shares the same softirq/NAPI protection. If the cache is already
+ * full (or partly full) then XDP_DROP recycles would have to take a
+ * slower code path.
+ */
+#define PP_ALLOC_CACHE_SIZE 128
+#define PP_ALLOC_CACHE_REFILL 64
+struct pp_alloc_cache {
+ u32 count;
+ void *cache[PP_ALLOC_CACHE_SIZE];
+};
+
+struct page_pool_params {
+ unsigned int flags;
+ unsigned int order;
+ unsigned int pool_size;
+ int nid; /* NUMA node id to allocate pages from */
+ struct device *dev; /* device, for DMA pre-mapping purposes */
+ enum dma_data_direction dma_dir; /* DMA mapping direction */
+};
+
+struct page_pool {
+ struct rcu_head rcu;
+ struct page_pool_params p;
+
+ /*
+ * Data structure for allocation side
+ *
+ * The driver's allocation side usually already performs some kind
+ * of resource protection. Piggyback on this protection, and
+ * require the driver to protect the allocation side.
+ *
+ * For NIC drivers this means allocating a page_pool per RX-queue,
+ * as the RX-queue is already protected by softirq/BH scheduling
+ * and napi_schedule. The NAPI schedule guarantees that a single
+ * napi_struct will only be scheduled on a single CPU (see
+ * napi_schedule).
+ */
+ struct pp_alloc_cache alloc ____cacheline_aligned_in_smp;
+
+ /* Data structure for storing recycled pages.
+ *
+ * Returning/freeing pages is more complicated synchronization-wise,
+ * because frees can happen on remote CPUs, with no association
+ * with the allocation resource.
+ *
+ * Use ptr_ring, as it separates consumer and producer efficiently,
+ * in a way that doesn't bounce cache-lines.
+ *
+ * TODO: Implement bulk return pages into this structure.
+ */
+ struct ptr_ring ring;
+};
+
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
+
+static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
+{
+ gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+ return page_pool_alloc_pages(pool, gfp);
+}
+
+struct page_pool *page_pool_create(const struct page_pool_params *params);
+
+void page_pool_destroy(struct page_pool *pool);
+
+/* Never call this directly, use helpers below */
+void __page_pool_put_page(struct page_pool *pool,
+ struct page *page, bool allow_direct);
+
+static inline void page_pool_put_page(struct page_pool *pool, struct page *page)
+{
+ /* When page_pool isn't compiled-in, net/core/xdp.c doesn't
+ * allow registering MEM_TYPE_PAGE_POOL, but shield the linker
+ * from the unbuilt __page_pool_put_page() symbol.
+ */
+#ifdef CONFIG_PAGE_POOL
+ __page_pool_put_page(pool, page, false);
+#endif
+}
+/* Very limited use-cases allow direct recycling */
+static inline void page_pool_recycle_direct(struct page_pool *pool,
+ struct page *page)
+{
+ __page_pool_put_page(pool, page, true);
+}
+
+static inline bool is_page_pool_compiled_in(void)
+{
+#ifdef CONFIG_PAGE_POOL
+ return true;
+#else
+ return false;
+#endif
+}
+
+#endif /* _NET_PAGE_POOL_H */
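
Read together, the allocation and return halves of this header suggest a usage pattern along the following lines. This is a hedged sketch: the my_* wrappers are invented, while the page_pool_* calls are exactly the ones declared above.

#include <net/page_pool.h>

/* RX refill hot path: drop-in replacement for dev_alloc_pages(). */
static struct page *my_rx_refill_one(struct page_pool *pool)
{
	return page_pool_dev_alloc_pages(pool); /* GFP_ATOMIC | __GFP_NOWARN */
}

/* XDP_DROP inside the same softirq/NAPI context: lockless recycle
 * straight into the pp_alloc_cache array.
 */
static void my_rx_drop(struct page_pool *pool, struct page *page)
{
	page_pool_recycle_direct(pool, page);
}

/* Return from anywhere else (e.g. a remote CPU after redirect): goes
 * through the ptr_ring; a no-op shim when CONFIG_PAGE_POOL is not set.
 */
static void my_frame_done(struct page_pool *pool, struct page *page)
{
	page_pool_put_page(pool, page);
}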
diff --git a/include/net/xdp.h b/include/net/xdp.h
index b2362ddfa694..137ad5f9f40f 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -33,16 +33,99 @@
* also mandatory during RX-ring setup.
*/
+enum xdp_mem_type {
+ MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
+ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */
+ MEM_TYPE_PAGE_POOL,
+ MEM_TYPE_MAX,
+};
+
+struct xdp_mem_info {
+ u32 type; /* enum xdp_mem_type, but known size type */
+ u32 id;
+};
+
+struct page_pool;
+
struct xdp_rxq_info {
struct net_device *dev;
u32 queue_index;
u32 reg_state;
+ struct xdp_mem_info mem;
} ____cacheline_aligned; /* perf critical, avoid false-sharing */
+struct xdp_buff {
+ void *data;
+ void *data_end;
+ void *data_meta;
+ void *data_hard_start;
+ struct xdp_rxq_info *rxq;
+};
+
+struct xdp_frame {
+ void *data;
+ u16 len;
+ u16 headroom;
+ u16 metasize;
+ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
+ * while mem info is valid on remote CPU.
+ */
+ struct xdp_mem_info mem;
+ struct net_device *dev_rx; /* used by cpumap */
+};
+
+/* Convert xdp_buff to xdp_frame */
+static inline
+struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
+{
+ struct xdp_frame *xdp_frame;
+ int metasize;
+ int headroom;
+
+ /* Ensure enough headroom is available for storing the info */
+ headroom = xdp->data - xdp->data_hard_start;
+ metasize = xdp->data - xdp->data_meta;
+ metasize = metasize > 0 ? metasize : 0;
+ if (unlikely((headroom - metasize) < sizeof(*xdp_frame)))
+ return NULL;
+
+ /* Store info in top of packet */
+ xdp_frame = xdp->data_hard_start;
+
+ xdp_frame->data = xdp->data;
+ xdp_frame->len = xdp->data_end - xdp->data;
+ xdp_frame->headroom = headroom - sizeof(*xdp_frame);
+ xdp_frame->metasize = metasize;
+
+ /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
+ xdp_frame->mem = xdp->rxq->mem;
+
+ return xdp_frame;
+}
+
+void xdp_return_frame(struct xdp_frame *xdpf);
+
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
struct net_device *dev, u32 queue_index);
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+ enum xdp_mem_type type, void *allocator);
+
+/* Drivers not supporting XDP metadata can use this helper, which
+ * rejects any room expansion for metadata as a result.
+ */
+static __always_inline void
+xdp_set_data_meta_invalid(struct xdp_buff *xdp)
+{
+ xdp->data_meta = xdp->data + 1;
+}
+
+static __always_inline bool
+xdp_data_meta_unsupported(const struct xdp_buff *xdp)
+{
+ return unlikely(xdp->data_meta > xdp->data);
+}
#endif /* __LINUX_NET_XDP_H__ */
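
Tying the xdp.h additions together, here is a hedged sketch of the redirect round trip; the my_* functions are invented wrappers, while convert_to_xdp_frame() and xdp_return_frame() are the API added above.

#include <net/xdp.h>

/* Sender side, still inside the RX driver's NAPI poll: the buffer is
 * converted in place, with the xdp_frame metadata (len, headroom,
 * metasize, mem info) stored in the packet headroom.
 */
static struct xdp_frame *my_redirect_prepare(struct xdp_buff *xdp)
{
	struct xdp_frame *xdpf = convert_to_xdp_frame(xdp);

	/* NULL means there was not sizeof(*xdpf) of headroom to spare. */
	return xdpf;
}

/* Consumer side, possibly on another CPU long after the NAPI poll has
 * ended: only xdpf->mem is consulted, never the stale xdp_rxq_info, so
 * the frame can be returned to its originating allocator safely.
 */
static void my_frame_consumed(struct xdp_frame *xdpf)
{
	xdp_return_frame(xdpf);
}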