aboutsummaryrefslogtreecommitdiffstats
path: root/net/core/skbuff.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--net/core/skbuff.c622
1 files changed, 354 insertions, 268 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ba2f38246f07..88fa40571d0c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -80,7 +80,7 @@
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
-#include "datagram.h"
+#include "dev.h"
#include "sock_destructor.h"
struct kmem_cache *skbuff_head_cache __ro_after_init;
@@ -91,6 +91,13 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init;
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);
+#undef FN
+#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
+const char * const drop_reasons[] = {
+ DEFINE_DROP_REASON(FN, FN)
+};
+EXPORT_SYMBOL(drop_reasons);
+
/**
* skb_panic - private function for out-of-line support
* @skb: buffer
@@ -127,8 +134,66 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
#define NAPI_SKB_CACHE_BULK 16
#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
+#if PAGE_SIZE == SZ_4K
+
+#define NAPI_HAS_SMALL_PAGE_FRAG 1
+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc)
+
+/* specialized page frag allocator using a single order 0 page
+ * and slicing it into 1K sized fragment. Constrained to systems
+ * with a very limited amount of 1K fragments fitting a single
+ * page - to avoid excessive truesize underestimation
+ */
+
+struct page_frag_1k {
+ void *va;
+ u16 offset;
+ bool pfmemalloc;
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
+{
+ struct page *page;
+ int offset;
+
+ offset = nc->offset - SZ_1K;
+ if (likely(offset >= 0))
+ goto use_frag;
+
+ page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+ if (!page)
+ return NULL;
+
+ nc->va = page_address(page);
+ nc->pfmemalloc = page_is_pfmemalloc(page);
+ offset = PAGE_SIZE - SZ_1K;
+ page_ref_add(page, offset / SZ_1K);
+
+use_frag:
+ nc->offset = offset;
+ return nc->va + offset;
+}
+#else
+
+/* the small page is actually unused in this build; add dummy helpers
+ * to please the compiler and avoid later preprocessor's conditionals
+ */
+#define NAPI_HAS_SMALL_PAGE_FRAG 0
+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false
+
+struct page_frag_1k {
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
+{
+ return NULL;
+}
+
+#endif
+
struct napi_alloc_cache {
struct page_frag_cache page;
+ struct page_frag_1k page_small;
unsigned int skb_count;
void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
@@ -136,6 +201,23 @@ struct napi_alloc_cache {
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+void napi_get_frags_check(struct napi_struct *napi)
+{
+ struct sk_buff *skb;
+
+ local_bh_disable();
+ skb = napi_get_frags(napi);
+ WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
+ napi_free_frags(napi);
+ local_bh_enable();
+}
+
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
@@ -172,13 +254,14 @@ static struct sk_buff *napi_skb_cache_get(void)
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
- if (unlikely(!nc->skb_count))
+ if (unlikely(!nc->skb_count)) {
nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
GFP_ATOMIC,
NAPI_SKB_CACHE_BULK,
nc->skb_cache);
- if (unlikely(!nc->skb_count))
- return NULL;
+ if (unlikely(!nc->skb_count))
+ return NULL;
+ }
skb = nc->skb_cache[--nc->skb_count];
kasan_unpoison_object_data(skbuff_head_cache, skb);
@@ -201,10 +284,10 @@ static void __build_skb_around(struct sk_buff *skb, void *data,
skb->head = data;
skb->data = data;
skb_reset_tail_pointer(skb);
- skb->end = skb->tail + size;
+ skb_set_end_offset(skb, size);
skb->mac_header = (typeof(skb->mac_header))~0U;
skb->transport_header = (typeof(skb->transport_header))~0U;
-
+ skb->alloc_cpu = raw_smp_processor_id();
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
@@ -450,8 +533,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb->fclone = SKB_FCLONE_ORIG;
refcount_set(&fclones->fclone_ref, 1);
-
- fclones->skb2.fclone = SKB_FCLONE_CLONE;
}
return skb;
@@ -555,14 +636,18 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
{
struct napi_alloc_cache *nc;
struct sk_buff *skb;
+ bool pfmemalloc;
void *data;
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
len += NET_SKB_PAD + NET_IP_ALIGN;
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
+ * When the small frag allocator is available, prefer it over kmalloc
+ * for small fragments
*/
- if (len <= SKB_WITH_OVERHEAD(1024) ||
+ if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -573,13 +658,33 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
}
nc = this_cpu_ptr(&napi_alloc_cache);
- len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- len = SKB_DATA_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- data = page_frag_alloc(&nc->page, len, gfp_mask);
+ if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
+ /* we are artificially inflating the allocation size, but
+ * that is not as bad as it may look like, as:
+ * - 'len' less than GRO_MAX_HEAD makes little sense
+ * - On most systems, larger 'len' values lead to fragment
+ * size above 512 bytes
+ * - kmalloc would use the kmalloc-1k slab for such values
+ * - Builds with smaller GRO_MAX_HEAD will very likely do
+ * little networking, as that implies no WiFi and no
+ * tunnels support, and 32 bits arches.
+ */
+ len = SZ_1K;
+
+ data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
+ pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
+ } else {
+ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ len = SKB_DATA_ALIGN(len);
+
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
+ pfmemalloc = nc->page.pfmemalloc;
+ }
+
if (unlikely(!data))
return NULL;
@@ -589,7 +694,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
return NULL;
}
- if (nc->page.pfmemalloc)
+ if (pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -666,11 +771,18 @@ static void skb_release_data(struct sk_buff *skb)
&shinfo->dataref))
goto exit;
- skb_zcopy_clear(skb, true);
+ if (skb_zcopy(skb)) {
+ bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;
+
+ skb_zcopy_clear(skb, true);
+ if (skip_unref)
+ goto free_head;
+ }
for (i = 0; i < shinfo->nr_frags; i++)
__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+free_head:
if (shinfo->frag_list)
kfree_skb_list(shinfo->frag_list);
@@ -681,7 +793,7 @@ exit:
* while trying to recycle fragments on __skb_frag_unref() we need
* to make one SKB responsible for triggering the recycle path.
* So disable the recycling bit if an SKB is cloned and we have
- * additional references to to the fragmented part of the SKB.
+ * additional references to the fragmented part of the SKB.
* Eventually the last SKB will have the recycling bit set and it's
* dataref set to 0, which will trigger the recycling
*/
@@ -725,7 +837,7 @@ void skb_release_head_state(struct sk_buff *skb)
{
skb_dst_drop(skb);
if (skb->destructor) {
- WARN_ON(in_hardirq());
+ DEBUG_NET_WARN_ON_ONCE(in_hardirq());
skb->destructor(skb);
}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -759,32 +871,38 @@ void __kfree_skb(struct sk_buff *skb)
EXPORT_SYMBOL(__kfree_skb);
/**
- * kfree_skb - free an sk_buff
+ * kfree_skb_reason - free an sk_buff with special reason
* @skb: buffer to free
+ * @reason: reason why this skb is dropped
*
* Drop a reference to the buffer and free it if the usage count has
- * hit zero.
+ * hit zero. Meanwhile, pass the drop reason to 'kfree_skb'
+ * tracepoint.
*/
-void kfree_skb(struct sk_buff *skb)
+void __fix_address
+kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
- if (!skb_unref(skb))
+ if (unlikely(!skb_unref(skb)))
return;
- trace_kfree_skb(skb, __builtin_return_address(0));
+ DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);
+
+ trace_kfree_skb(skb, __builtin_return_address(0), reason);
__kfree_skb(skb);
}
-EXPORT_SYMBOL(kfree_skb);
+EXPORT_SYMBOL(kfree_skb_reason);
-void kfree_skb_list(struct sk_buff *segs)
+void kfree_skb_list_reason(struct sk_buff *segs,
+ enum skb_drop_reason reason)
{
while (segs) {
struct sk_buff *next = segs->next;
- kfree_skb(segs);
+ kfree_skb_reason(segs, reason);
segs = next;
}
}
-EXPORT_SYMBOL(kfree_skb_list);
+EXPORT_SYMBOL(kfree_skb_list_reason);
/* Dump skb information and contents.
*
@@ -832,7 +950,7 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
if (dev)
- printk("%sdev name=%s feat=0x%pNF\n",
+ printk("%sdev name=%s feat=%pNF\n",
level, dev->name, &dev->features);
if (sk)
printk("%ssk family=%hu type=%u proto=%u\n",
@@ -890,7 +1008,10 @@ EXPORT_SYMBOL(skb_dump);
*/
void skb_tx_error(struct sk_buff *skb)
{
- skb_zcopy_clear(skb, true);
+ if (skb) {
+ skb_zcopy_downgrade_managed(skb);
+ skb_zcopy_clear(skb, true);
+ }
}
EXPORT_SYMBOL(skb_tx_error);
@@ -973,7 +1094,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- lockdep_assert_in_softirq();
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
if (!skb_unref(skb))
return;
@@ -992,12 +1113,10 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
}
EXPORT_SYMBOL(napi_consume_skb);
-/* Make sure a field is enclosed inside headers_start/headers_end section */
+/* Make sure a field is contained by headers group */
#define CHECK_SKB_FIELD(field) \
- BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
- offsetof(struct sk_buff, headers_start)); \
- BUILD_BUG_ON(offsetof(struct sk_buff, field) > \
- offsetof(struct sk_buff, headers_end)); \
+ BUILD_BUG_ON(offsetof(struct sk_buff, field) != \
+ offsetof(struct sk_buff, headers.field)); \
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
@@ -1009,14 +1128,12 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
__skb_ext_copy(new, old);
__nf_copy(new, old, false);
- /* Note : this field could be in headers_start/headers_end section
+ /* Note : this field could be in the headers group.
* It is not yet because we do not want to have a 16 bit hole
*/
new->queue_mapping = old->queue_mapping;
- memcpy(&new->headers_start, &old->headers_start,
- offsetof(struct sk_buff, headers_end) -
- offsetof(struct sk_buff, headers_start));
+ memcpy(&new->headers, &old->headers, sizeof(new->headers));
CHECK_SKB_FIELD(protocol);
CHECK_SKB_FIELD(csum);
CHECK_SKB_FIELD(hash);
@@ -1038,6 +1155,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#ifdef CONFIG_NET_RX_BUSY_POLL
CHECK_SKB_FIELD(napi_id);
#endif
+ CHECK_SKB_FIELD(alloc_cpu);
#ifdef CONFIG_XPS
CHECK_SKB_FIELD(sender_cpu);
#endif
@@ -1166,9 +1284,9 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp)
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
-struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
+static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
{
- struct ubuf_info *uarg;
+ struct ubuf_info_msgzc *uarg;
struct sk_buff *skb;
WARN_ON_ONCE(!in_task());
@@ -1186,20 +1304,19 @@ struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
return NULL;
}
- uarg->callback = msg_zerocopy_callback;
+ uarg->ubuf.callback = msg_zerocopy_callback;
uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
- uarg->flags = SKBFL_ZEROCOPY_FRAG;
- refcount_set(&uarg->refcnt, 1);
+ uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+ refcount_set(&uarg->ubuf.refcnt, 1);
sock_hold(sk);
- return uarg;
+ return &uarg->ubuf;
}
-EXPORT_SYMBOL_GPL(msg_zerocopy_alloc);
-static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
+static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
{
return container_of((void *)uarg, struct sk_buff, cb);
}
@@ -1208,9 +1325,14 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
struct ubuf_info *uarg)
{
if (uarg) {
+ struct ubuf_info_msgzc *uarg_zc;
const u32 byte_limit = 1 << 19; /* limit to a few TSO */
u32 bytelen, next;
+ /* there might be non MSG_ZEROCOPY users */
+ if (uarg->callback != msg_zerocopy_callback)
+ return NULL;
+
/* realloc only when socket is locked (TCP, UDP cork),
* so uarg->len and sk_zckey access is serialized
*/
@@ -1219,8 +1341,9 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
return NULL;
}
- bytelen = uarg->bytelen + size;
- if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
+ uarg_zc = uarg_to_msgzc(uarg);
+ bytelen = uarg_zc->bytelen + size;
+ if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) {
/* TCP can create new skb to attach new uarg */
if (sk->sk_type == SOCK_STREAM)
goto new_alloc;
@@ -1228,11 +1351,11 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
}
next = (u32)atomic_read(&sk->sk_zckey);
- if ((u32)(uarg->id + uarg->len) == next) {
- if (mm_account_pinned_pages(&uarg->mmp, size))
+ if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
+ if (mm_account_pinned_pages(&uarg_zc->mmp, size))
return NULL;
- uarg->len++;
- uarg->bytelen = bytelen;
+ uarg_zc->len++;
+ uarg_zc->bytelen = bytelen;
atomic_set(&sk->sk_zckey, ++next);
/* no extra ref when appending to datagram (MSG_MORE) */
@@ -1268,7 +1391,7 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
return true;
}
-static void __msg_zerocopy_callback(struct ubuf_info *uarg)
+static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
{
struct sk_buff *tail, *skb = skb_from_uarg(uarg);
struct sock_exterr_skb *serr;
@@ -1321,37 +1444,32 @@ release:
void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
bool success)
{
- uarg->zerocopy = uarg->zerocopy & success;
+ struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
+
+ uarg_zc->zerocopy = uarg_zc->zerocopy & success;
if (refcount_dec_and_test(&uarg->refcnt))
- __msg_zerocopy_callback(uarg);
+ __msg_zerocopy_callback(uarg_zc);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
- struct sock *sk = skb_from_uarg(uarg)->sk;
+ struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk;
atomic_dec(&sk->sk_zckey);
- uarg->len--;
+ uarg_to_msgzc(uarg)->len--;
if (have_uref)
msg_zerocopy_callback(NULL, uarg, true);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
-int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
-{
- return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
-}
-EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
-
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
struct ubuf_info *uarg)
{
struct ubuf_info *orig_uarg = skb_zcopy(skb);
- struct iov_iter orig_iter = msg->msg_iter;
int err, orig_len = skb->len;
/* An skb can only point to one uarg. This edge case happens when
@@ -1360,12 +1478,12 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
if (orig_uarg && uarg != orig_uarg)
return -EEXIST;
- err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
+ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
struct sock *save_sk = skb->sk;
/* Streams do not free skb on error. Reset to prev state. */
- msg->msg_iter = orig_iter;
+ iov_iter_revert(&msg->msg_iter, skb->len - orig_len);
skb->sk = sk;
___pskb_trim(skb, orig_len);
skb->sk = save_sk;
@@ -1377,6 +1495,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
+void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
+{
+ int i;
+
+ skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_frag_ref(skb, i);
+}
+EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);
+
static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
gfp_t gfp_mask)
{
@@ -1514,6 +1642,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
refcount_read(&fclones->fclone_ref) == 1) {
n = &fclones->skb2;
refcount_set(&fclones->fclone_ref, 2);
+ n->fclone = SKB_FCLONE_CLONE;
} else {
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
@@ -1694,6 +1823,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
BUG_ON(skb_shared(skb));
+ skb_zcopy_downgrade_managed(skb);
+
size = SKB_DATA_ALIGN(size);
if (skb_pfmemalloc(skb))
@@ -1738,11 +1869,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb->head = data;
skb->head_frag = 0;
skb->data += off;
+
+ skb_set_end_offset(skb, size);
#ifdef NET_SKBUFF_DATA_USES_OFFSET
- skb->end = size;
off = nhead;
-#else
- skb->end = skb->head + size;
#endif
skb->tail += off;
skb_headers_offset_update(skb, nhead);
@@ -1790,6 +1920,38 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
}
EXPORT_SYMBOL(skb_realloc_headroom);
+int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
+{
+ unsigned int saved_end_offset, saved_truesize;
+ struct skb_shared_info *shinfo;
+ int res;
+
+ saved_end_offset = skb_end_offset(skb);
+ saved_truesize = skb->truesize;
+
+ res = pskb_expand_head(skb, 0, 0, pri);
+ if (res)
+ return res;
+
+ skb->truesize = saved_truesize;
+
+ if (likely(skb_end_offset(skb) == saved_end_offset))
+ return 0;
+
+ shinfo = skb_shinfo(skb);
+
+ /* We are about to change back skb->end,
+ * we need to move skb_shinfo() to its new location.
+ */
+ memmove(skb->head + saved_end_offset,
+ shinfo,
+ offsetof(struct skb_shared_info, frags[shinfo->nr_frags]));
+
+ skb_set_end_offset(skb, saved_end_offset);
+
+ return 0;
+}
+
/**
* skb_expand_head - reallocate header of &sk_buff
* @skb: buffer to reallocate
@@ -2028,6 +2190,30 @@ void *skb_pull(struct sk_buff *skb, unsigned int len)
EXPORT_SYMBOL(skb_pull);
/**
+ * skb_pull_data - remove data from the start of a buffer returning its
+ * original position.
+ * @skb: buffer to use
+ * @len: amount of data to remove
+ *
+ * This function removes data from the start of a buffer, returning
+ * the memory to the headroom. A pointer to the original data in the buffer
+ * is returned after checking if there is enough data to pull. Once the
+ * data has been pulled future pushes will overwrite the old data.
+ */
+void *skb_pull_data(struct sk_buff *skb, size_t len)
+{
+ void *data = skb->data;
+
+ if (skb->len < len)
+ return NULL;
+
+ skb_pull(skb, len);
+
+ return data;
+}
+EXPORT_SYMBOL(skb_pull_data);
+
+/**
* skb_trim - remove end from a buffer
* @skb: buffer to alter
* @len: new length
@@ -2254,7 +2440,7 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
/* Free pulled out fragments. */
while ((list = skb_shinfo(skb)->frag_list) != insp) {
skb_shinfo(skb)->frag_list = list->next;
- kfree_skb(list);
+ consume_skb(list);
}
/* And insert new clone at head. */
if (clone) {
@@ -3141,9 +3327,7 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
}
}
- to->truesize += len + plen;
- to->len += len + plen;
- to->data_len += len + plen;
+ skb_len_add(to, len + plen);
if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
skb_tx_error(from);
@@ -3435,6 +3619,8 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
int pos = skb_headlen(skb);
const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
+ skb_zcopy_downgrade_managed(skb);
+
skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
skb_zerocopy_clone(skb1, skb, 0);
if (len < pos) /* Split line is inside header. */
@@ -3580,13 +3766,8 @@ onlymerged:
tgt->ip_summed = CHECKSUM_PARTIAL;
skb->ip_summed = CHECKSUM_PARTIAL;
- /* Yak, is it really working this way? Some helper please? */
- skb->len -= shiftlen;
- skb->data_len -= shiftlen;
- skb->truesize -= shiftlen;
- tgt->len += shiftlen;
- tgt->data_len += shiftlen;
- tgt->truesize += shiftlen;
+ skb_len_add(skb, -shiftlen);
+ skb_len_add(tgt, shiftlen);
return shiftlen;
}
@@ -3788,8 +3969,9 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
if (skb_can_coalesce(skb, i, page, offset)) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
} else if (i < MAX_SKB_FRAGS) {
+ skb_zcopy_downgrade_managed(skb);
get_page(page);
- skb_fill_page_desc(skb, i, page, offset, size);
+ skb_fill_page_desc_noacc(skb, i, page, offset, size);
} else {
return -EMSGSIZE;
}
@@ -3843,7 +4025,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb,
unsigned int delta_len = 0;
struct sk_buff *tail = NULL;
struct sk_buff *nskb, *tmp;
- int err;
+ int len_diff, err;
skb_push(skb, -skb_network_offset(skb) + offset);
@@ -3854,6 +4036,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb,
list_skb = list_skb->next;
err = 0;
+ delta_truesize += nskb->truesize;
if (skb_shared(nskb)) {
tmp = skb_clone(nskb, GFP_ATOMIC);
if (tmp) {
@@ -3878,14 +4061,15 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb,
tail = nskb;
delta_len += nskb->len;
- delta_truesize += nskb->truesize;
skb_push(nskb, -skb_network_offset(nskb) + offset);
skb_release_head_state(nskb);
+ len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb);
__copy_skb_header(nskb, skb);
skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
+ nskb->transport_header += len_diff;
skb_copy_from_linear_data_offset(skb, -tnl_hlen,
nskb->data - tnl_hlen,
offset + tnl_hlen);
@@ -3919,32 +4103,6 @@ err_linearize:
}
EXPORT_SYMBOL_GPL(skb_segment_list);
-int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
-{
- if (unlikely(p->len + skb->len >= 65536))
- return -E2BIG;
-
- if (NAPI_GRO_CB(p)->last == p)
- skb_shinfo(p)->frag_list = skb;
- else
- NAPI_GRO_CB(p)->last->next = skb;
-
- skb_pull(skb, skb_gro_offset(skb));
-
- NAPI_GRO_CB(p)->last = skb;
- NAPI_GRO_CB(p)->count++;
- p->data_len += skb->len;
-
- /* sk owenrship - if any - completely transferred to the aggregated packet */
- skb->destructor = NULL;
- p->truesize += skb->truesize;
- p->len += skb->len;
-
- NAPI_GRO_CB(skb)->same_flow = 1;
-
- return 0;
-}
-
/**
* skb_segment - Perform protocol segmentation on skb.
* @head_skb: buffer to segment
@@ -3976,23 +4134,25 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
int i = 0;
int pos;
- if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) &&
- (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) {
- /* gso_size is untrusted, and we have a frag_list with a linear
- * non head_frag head.
- *
- * (we assume checking the first list_skb member suffices;
- * i.e if either of the list_skb members have non head_frag
- * head, then the first one has too).
- *
- * If head_skb's headlen does not fit requested gso_size, it
- * means that the frag_list members do NOT terminate on exact
- * gso_size boundaries. Hence we cannot perform skb_frag_t page
- * sharing. Therefore we must fallback to copying the frag_list
- * skbs; we do so by disabling SG.
- */
- if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb))
- features &= ~NETIF_F_SG;
+ if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
+ mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
+ struct sk_buff *check_skb;
+
+ for (check_skb = list_skb; check_skb; check_skb = check_skb->next) {
+ if (skb_headlen(check_skb) && !check_skb->head_frag) {
+ /* gso_size is untrusted, and we have a frag_list with
+ * a linear non head_frag item.
+ *
+ * If head_skb's headlen does not fit requested gso_size,
+ * it means that the frag_list members do NOT terminate
+ * on exact gso_size boundaries. Hence we cannot perform
+ * skb_frag_t page sharing. Therefore we must fallback to
+ * copying the frag_list skbs; we do so by disabling SG.
+ */
+ features &= ~NETIF_F_SG;
+ break;
+ }
+ }
}
__skb_push(head_skb, doffset);
@@ -4154,9 +4314,8 @@ normal:
SKB_GSO_CB(nskb)->csum_start =
skb_headroom(nskb) + doffset;
} else {
- skb_copy_bits(head_skb, offset,
- skb_put(nskb, len),
- len);
+ if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len))
+ goto err;
}
continue;
}
@@ -4297,122 +4456,6 @@ err:
}
EXPORT_SYMBOL_GPL(skb_segment);
-int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
-{
- struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
- unsigned int offset = skb_gro_offset(skb);
- unsigned int headlen = skb_headlen(skb);
- unsigned int len = skb_gro_len(skb);
- unsigned int delta_truesize;
- unsigned int new_truesize;
- struct sk_buff *lp;
-
- if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
- return -E2BIG;
-
- lp = NAPI_GRO_CB(p)->last;
- pinfo = skb_shinfo(lp);
-
- if (headlen <= offset) {
- skb_frag_t *frag;
- skb_frag_t *frag2;
- int i = skbinfo->nr_frags;
- int nr_frags = pinfo->nr_frags + i;
-
- if (nr_frags > MAX_SKB_FRAGS)
- goto merge;
-
- offset -= headlen;
- pinfo->nr_frags = nr_frags;
- skbinfo->nr_frags = 0;
-
- frag = pinfo->frags + nr_frags;
- frag2 = skbinfo->frags + i;
- do {
- *--frag = *--frag2;
- } while (--i);
-
- skb_frag_off_add(frag, offset);
- skb_frag_size_sub(frag, offset);
-
- /* all fragments truesize : remove (head size + sk_buff) */
- new_truesize = SKB_TRUESIZE(skb_end_offset(skb));
- delta_truesize = skb->truesize - new_truesize;
-
- skb->truesize = new_truesize;
- skb->len -= skb->data_len;
- skb->data_len = 0;
-
- NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
- goto done;
- } else if (skb->head_frag) {
- int nr_frags = pinfo->nr_frags;
- skb_frag_t *frag = pinfo->frags + nr_frags;
- struct page *page = virt_to_head_page(skb->head);
- unsigned int first_size = headlen - offset;
- unsigned int first_offset;
-
- if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
- goto merge;
-
- first_offset = skb->data -
- (unsigned char *)page_address(page) +
- offset;
-
- pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
-
- __skb_frag_set_page(frag, page);
- skb_frag_off_set(frag, first_offset);
- skb_frag_size_set(frag, first_size);
-
- memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
- /* We dont need to clear skbinfo->nr_frags here */
-
- new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff));
- delta_truesize = skb->truesize - new_truesize;
- skb->truesize = new_truesize;
- NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
- goto done;
- }
-
-merge:
- /* sk owenrship - if any - completely transferred to the aggregated packet */
- skb->destructor = NULL;
- delta_truesize = skb->truesize;
- if (offset > headlen) {
- unsigned int eat = offset - headlen;
-
- skb_frag_off_add(&skbinfo->frags[0], eat);
- skb_frag_size_sub(&skbinfo->frags[0], eat);
- skb->data_len -= eat;
- skb->len -= eat;
- offset = headlen;
- }
-
- __skb_pull(skb, offset);
-
- if (NAPI_GRO_CB(p)->last == p)
- skb_shinfo(p)->frag_list = skb;
- else
- NAPI_GRO_CB(p)->last->next = skb;
- NAPI_GRO_CB(p)->last = skb;
- __skb_header_release(skb);
- lp = p;
-
-done:
- NAPI_GRO_CB(p)->count++;
- p->data_len += len;
- p->truesize += delta_truesize;
- p->len += len;
- if (lp != p) {
- lp->data_len += len;
- lp->truesize += delta_truesize;
- lp->len += len;
- }
- NAPI_GRO_CB(skb)->same_flow = 1;
- return 0;
-}
-
#ifdef CONFIG_SKB_EXTENSIONS
#define SKB_EXT_ALIGN_VALUE 8
#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)
@@ -4849,9 +4892,8 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
serr->ee.ee_data = skb_shinfo(skb)->tskey;
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM)
- serr->ee.ee_data -= sk->sk_tskey;
+ if (sk_is_tcp(sk))
+ serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
}
err = sock_queue_err_skb(sk, skb);
@@ -4864,7 +4906,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
bool ret;
- if (likely(sysctl_tstamp_allow_data || tsonly))
+ if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly))
return true;
read_lock_bh(&sk->sk_callback_lock);
@@ -4919,8 +4961,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (tsonly) {
#ifdef CONFIG_INET
if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
- sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM) {
+ sk_is_tcp(sk)) {
skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
ack_skb);
opt_stats = true;
@@ -4942,7 +4983,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (hwtstamps)
*skb_hwtstamps(skb) = *hwtstamps;
else
- skb->tstamp = ktime_get_real();
+ __net_timestamp(skb);
__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
@@ -5366,11 +5407,18 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
if (skb_cloned(to))
return false;
- /* The page pool signature of struct page will eventually figure out
- * which pages can be recycled or not but for now let's prohibit slab
- * allocated and page_pool allocated SKBs from being coalesced.
+ /* In general, avoid mixing slab allocated and page_pool allocated
+ * pages within the same SKB. However when @to is not pp_recycle and
+ * @from is cloned, we can transition frag pages from page_pool to
+ * reference counted.
+ *
+ * On the other hand, don't allow coalescing two pp_recycle SKBs if
+ * @from is cloned, in case the SKB is using page_pool fragment
+ * references (PP_FLAG_PAGE_FRAG). Since we only take full page
+ * references for cloned SKBs at the moment that would result in
+ * inconsistent reference counts.
*/
- if (to->pp_recycle != from->pp_recycle)
+ if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from)))
return false;
if (len <= skb_tailroom(to)) {
@@ -5472,7 +5520,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
ipvs_reset(skb);
skb->mark = 0;
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);
@@ -5684,7 +5732,7 @@ err_free:
}
EXPORT_SYMBOL(skb_vlan_untag);
-int skb_ensure_writable(struct sk_buff *skb, int write_len)
+int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
{
if (!pskb_may_pull(skb, write_len))
return -ENOMEM;
@@ -6166,11 +6214,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb->head = data;
skb->data = data;
skb->head_frag = 0;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- skb->end = size;
-#else
- skb->end = skb->head + size;
-#endif
+ skb_set_end_offset(skb, size);
skb_set_tail_pointer(skb, skb_headlen(skb));
skb_headers_offset_update(skb, 0);
skb->cloned = 0;
@@ -6227,7 +6271,7 @@ static int pskb_carve_frag_list(struct sk_buff *skb,
/* Free pulled out fragments. */
while ((list = shinfo->frag_list) != insp) {
shinfo->frag_list = list->next;
- kfree_skb(list);
+ consume_skb(list);
}
/* And insert new clone at head. */
if (clone) {
@@ -6308,11 +6352,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
skb->head = data;
skb->head_frag = 0;
skb->data = data;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- skb->end = size;
-#else
- skb->end = skb->head + size;
-#endif
+ skb_set_end_offset(skb, size);
skb_reset_tail_pointer(skb);
skb_headers_offset_update(skb, 0);
skb->cloned = 0;
@@ -6577,3 +6617,49 @@ free_now:
}
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */
+
+/**
+ * skb_attempt_defer_free - queue skb for remote freeing
+ * @skb: buffer
+ *
+ * Put @skb in a per-cpu list, using the cpu which
+ * allocated the skb/pages to reduce false sharing
+ * and memory zone spinlock contention.
+ */
+void skb_attempt_defer_free(struct sk_buff *skb)
+{
+ int cpu = skb->alloc_cpu;
+ struct softnet_data *sd;
+ unsigned long flags;
+ unsigned int defer_max;
+ bool kick;
+
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
+ !cpu_online(cpu) ||
+ cpu == raw_smp_processor_id()) {
+nodefer: __kfree_skb(skb);
+ return;
+ }
+
+ sd = &per_cpu(softnet_data, cpu);
+ defer_max = READ_ONCE(sysctl_skb_defer_max);
+ if (READ_ONCE(sd->defer_count) >= defer_max)
+ goto nodefer;
+
+ spin_lock_irqsave(&sd->defer_lock, flags);
+ /* Send an IPI every time queue reaches half capacity. */
+ kick = sd->defer_count == (defer_max >> 1);
+ /* Paired with the READ_ONCE() few lines above */
+ WRITE_ONCE(sd->defer_count, sd->defer_count + 1);
+
+ skb->next = sd->defer_list;
+ /* Paired with READ_ONCE() in skb_defer_free_flush() */
+ WRITE_ONCE(sd->defer_list, skb);
+ spin_unlock_irqrestore(&sd->defer_lock, flags);
+
+ /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
+ * if we are unlucky enough (this seems very unlikely).
+ */
+ if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
+ smp_call_function_single_async(cpu, &sd->defer_csd);
+}