From 787bea7748a76130566f881c2342a0be4127d182 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:43 -0700 Subject: inet: frags: change inet_frags_init_net() return value We will soon initialize one rhashtable per struct netns_frags in inet_frags_init_net(). This patch changes the return value to eventually propagate an error. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index bbf1b94942c0..e0b39d4ecbd4 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -846,6 +846,8 @@ static void __init ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { + int res; + /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -871,9 +873,13 @@ static int __net_init ipv4_frags_init_net(struct net *net) net->ipv4.frags.max_dist = 64; - inet_frags_init_net(&net->ipv4.frags); - - return ip4_frags_ns_ctl_register(net); + res = inet_frags_init_net(&net->ipv4.frags); + if (res < 0) + return res; + res = ip4_frags_ns_ctl_register(net); + if (res < 0) + inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); + return res; } static void __net_exit ipv4_frags_exit_net(struct net *net) -- cgit v1.2.3-59-g8ed1b From 093ba72914b696521e4885756a68a3332782c8de Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:44 -0700 Subject: inet: frags: add a pointer to struct netns_frags In order to simplify the API, add a pointer to struct inet_frags. This will allow us to make things less complex. These functions no longer have a struct inet_frags parameter : inet_frag_destroy(struct inet_frag_queue *q /*, struct inet_frags *f */) inet_frag_put(struct inet_frag_queue *q /*, struct inet_frags *f */) inet_frag_kill(struct inet_frag_queue *q /*, struct inet_frags *f */) inet_frags_exit_net(struct netns_frags *nf /*, struct inet_frags *f */) ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_frag.h | 11 ++++++----- include/net/ipv6.h | 3 +-- net/ieee802154/6lowpan/reassembly.c | 13 +++++++------ net/ipv4/inet_fragment.c | 17 ++++++++++------- net/ipv4/ip_fragment.c | 9 +++++---- net/ipv6/netfilter/nf_conntrack_reasm.c | 16 +++++++++------- net/ipv6/reassembly.c | 20 ++++++++++---------- 7 files changed, 48 insertions(+), 41 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index b1d62176f3b4..69e531ed8189 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -10,6 +10,7 @@ struct netns_frags { int high_thresh; int low_thresh; int max_dist; + struct inet_frags *f; }; /** @@ -109,20 +110,20 @@ static inline int inet_frags_init_net(struct netns_frags *nf) atomic_set(&nf->mem, 0); return 0; } -void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); +void inet_frags_exit_net(struct netns_frags *nf); -void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f); -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f); +void inet_frag_kill(struct inet_frag_queue *q); +void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash); void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, const char *prefix); -static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f) +static inline void inet_frag_put(struct inet_frag_queue *q) { if (refcount_dec_and_test(&q->refcnt)) - inet_frag_destroy(q, f); + inet_frag_destroy(q); } static inline bool inet_frag_evicting(struct inet_frag_queue *q) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5c18836672e9..57b7fe43d2ab 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -607,8 +607,7 @@ struct frag_queue { u8 ecn; }; -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, - struct inet_frags *frags); +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq); static inline bool ipv6_addr_any(const struct in6_addr *a) { diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 2aaab4bba429..6badc055555b 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -94,10 +94,10 @@ static void lowpan_frag_expire(struct timer_list *t) if (fq->q.flags & INET_FRAG_COMPLETE) goto out; - inet_frag_kill(&fq->q, &lowpan_frags); + inet_frag_kill(&fq->q); out: spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, &lowpan_frags); + inet_frag_put(&fq->q); } static inline struct lowpan_frag_queue * @@ -230,7 +230,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, struct sk_buff *fp, *head = fq->q.fragments; int sum_truesize; - inet_frag_kill(&fq->q, &lowpan_frags); + inet_frag_kill(&fq->q); /* Make the one we just received the head. */ if (prev) { @@ -438,7 +438,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) ret = lowpan_frag_queue(fq, skb, frag_type); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, &lowpan_frags); + inet_frag_put(&fq->q); return ret; } @@ -586,13 +586,14 @@ static int __net_init lowpan_frags_init_net(struct net *net) ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; + ieee802154_lowpan->frags.f = &lowpan_frags; res = inet_frags_init_net(&ieee802154_lowpan->frags); if (res < 0) return res; res = lowpan_frags_ns_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags); + inet_frags_exit_net(&ieee802154_lowpan->frags); return res; } @@ -602,7 +603,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net) net_ieee802154_lowpan(net); lowpan_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags); + inet_frags_exit_net(&ieee802154_lowpan->frags); } static struct pernet_operations lowpan_frags_ops = { diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index e8ec28999f5c..1ac69f65d0de 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -219,8 +219,9 @@ void inet_frags_fini(struct inet_frags *f) } EXPORT_SYMBOL(inet_frags_fini); -void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) +void inet_frags_exit_net(struct netns_frags *nf) { + struct inet_frags *f =nf->f; unsigned int seq; int i; @@ -264,33 +265,34 @@ __acquires(hb->chain_lock) return hb; } -static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +static inline void fq_unlink(struct inet_frag_queue *fq) { struct inet_frag_bucket *hb; - hb = get_frag_bucket_locked(fq, f); + hb = get_frag_bucket_locked(fq, fq->net->f); hlist_del(&fq->list); fq->flags |= INET_FRAG_COMPLETE; spin_unlock(&hb->chain_lock); } -void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) +void inet_frag_kill(struct inet_frag_queue *fq) { if (del_timer(&fq->timer)) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { - fq_unlink(fq, f); + fq_unlink(fq); refcount_dec(&fq->refcnt); } } EXPORT_SYMBOL(inet_frag_kill); -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) +void inet_frag_destroy(struct inet_frag_queue *q) { struct sk_buff *fp; struct netns_frags *nf; unsigned int sum, sum_truesize = 0; + struct inet_frags *f; WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); WARN_ON(del_timer(&q->timer) != 0); @@ -298,6 +300,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) /* Release all fragment data. */ fp = q->fragments; nf = q->net; + f = nf->f; while (fp) { struct sk_buff *xp = fp->next; @@ -333,7 +336,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, refcount_inc(&qp->refcnt); spin_unlock(&hb->chain_lock); qp_in->flags |= INET_FRAG_COMPLETE; - inet_frag_put(qp_in, f); + inet_frag_put(qp_in); return qp; } } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e0b39d4ecbd4..cd2b4c9419fc 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -168,7 +168,7 @@ static void ip4_frag_free(struct inet_frag_queue *q) static void ipq_put(struct ipq *ipq) { - inet_frag_put(&ipq->q, &ip4_frags); + inet_frag_put(&ipq->q); } /* Kill ipq entry. It is not destroyed immediately, @@ -176,7 +176,7 @@ static void ipq_put(struct ipq *ipq) */ static void ipq_kill(struct ipq *ipq) { - inet_frag_kill(&ipq->q, &ip4_frags); + inet_frag_kill(&ipq->q); } static bool frag_expire_skip_icmp(u32 user) @@ -872,20 +872,21 @@ static int __net_init ipv4_frags_init_net(struct net *net) net->ipv4.frags.timeout = IP_FRAG_TIME; net->ipv4.frags.max_dist = 64; + net->ipv4.frags.f = &ip4_frags; res = inet_frags_init_net(&net->ipv4.frags); if (res < 0) return res; res = ip4_frags_ns_ctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); + inet_frags_exit_net(&net->ipv4.frags); return res; } static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); - inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); + inet_frags_exit_net(&net->ipv4.frags); } static struct pernet_operations ip4_frags_ops = { diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 6ff41569134a..c4b40fdee838 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -178,7 +178,7 @@ static void nf_ct_frag6_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6_expire_frag_queue(net, fq, &nf_frags); + ip6_expire_frag_queue(net, fq); } /* Creation primitives. */ @@ -264,7 +264,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); - inet_frag_kill(&fq->q, &nf_frags); + inet_frag_kill(&fq->q); return -EPROTO; } if (end > fq->q.len) { @@ -357,7 +357,7 @@ found: return 0; discard_fq: - inet_frag_kill(&fq->q, &nf_frags); + inet_frag_kill(&fq->q); err: return -EINVAL; } @@ -379,7 +379,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic int payload_len; u8 ecn; - inet_frag_kill(&fq->q, &nf_frags); + inet_frag_kill(&fq->q); WARN_ON(head == NULL); WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); @@ -622,7 +622,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) out_unlock: spin_unlock_bh(&fq->q.lock); - inet_frag_put(&fq->q, &nf_frags); + inet_frag_put(&fq->q); return ret; } EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); @@ -634,19 +634,21 @@ static int nf_ct_net_init(struct net *net) net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; + net->nf_frag.frags.f = &nf_frags; + res = inet_frags_init_net(&net->nf_frag.frags); if (res < 0) return res; res = nf_ct_frag6_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->nf_frag.frags, &nf_frags); + inet_frags_exit_net(&net->nf_frag.frags); return res; } static void nf_ct_net_exit(struct net *net) { nf_ct_frags6_sysctl_unregister(net); - inet_frags_exit_net(&net->nf_frag.frags, &nf_frags); + inet_frags_exit_net(&net->nf_frag.frags); } static struct pernet_operations nf_ct_net_ops = { diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index a8f7a5f0251a..4855de6f673a 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -128,8 +128,7 @@ void ip6_frag_init(struct inet_frag_queue *q, const void *a) } EXPORT_SYMBOL(ip6_frag_init); -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, - struct inet_frags *frags) +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) { struct net_device *dev = NULL; @@ -138,7 +137,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, if (fq->q.flags & INET_FRAG_COMPLETE) goto out; - inet_frag_kill(&fq->q, frags); + inet_frag_kill(&fq->q); rcu_read_lock(); dev = dev_get_by_index_rcu(net, fq->iif); @@ -166,7 +165,7 @@ out_rcu_unlock: rcu_read_unlock(); out: spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, frags); + inet_frag_put(&fq->q); } EXPORT_SYMBOL(ip6_expire_frag_queue); @@ -179,7 +178,7 @@ static void ip6_frag_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ipv6.frags); - ip6_expire_frag_queue(net, fq, &ip6_frags); + ip6_expire_frag_queue(net, fq); } static struct frag_queue * @@ -364,7 +363,7 @@ found: return -1; discard_fq: - inet_frag_kill(&fq->q, &ip6_frags); + inet_frag_kill(&fq->q); err: __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); @@ -391,7 +390,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, int sum_truesize; u8 ecn; - inet_frag_kill(&fq->q, &ip6_frags); + inet_frag_kill(&fq->q); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -569,7 +568,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, &ip6_frags); + inet_frag_put(&fq->q); return ret; } @@ -716,6 +715,7 @@ static int __net_init ipv6_frags_init_net(struct net *net) net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; + net->ipv6.frags.f = &ip6_frags; res = inet_frags_init_net(&net->ipv6.frags); if (res < 0) @@ -723,14 +723,14 @@ static int __net_init ipv6_frags_init_net(struct net *net) res = ip6_frags_ns_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv6.frags, &ip6_frags); + inet_frags_exit_net(&net->ipv6.frags); return res; } static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&net->ipv6.frags, &ip6_frags); + inet_frags_exit_net(&net->ipv6.frags); } static struct pernet_operations ip6_frags_ops = { -- cgit v1.2.3-59-g8ed1b From 483a6e4fa055123142d8956866fe2aa9c98d546d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:47 -0700 Subject: inet: frags: refactor ipfrag_init() We need to call inet_frags_init() before register_pernet_subsys(), as a prereq for following patch ("inet: frags: use rhashtables for reassembly units") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index cd2b4c9419fc..1a3bc85d6f5e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -896,8 +896,6 @@ static struct pernet_operations ip4_frags_ops = { void __init ipfrag_init(void) { - ip4_frags_ctl_register(); - register_pernet_subsys(&ip4_frags_ops); ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; @@ -907,4 +905,6 @@ void __init ipfrag_init(void) ip4_frags.frags_cache_name = ip_frag_cache_name; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); + ip4_frags_ctl_register(); + register_pernet_subsys(&ip4_frags_ops); } -- cgit v1.2.3-59-g8ed1b From 648700f76b03b7e8149d13cc2bdb3355035258a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:49 -0700 Subject: inet: frags: use rhashtables for reassembly units Some applications still rely on IP fragmentation, and to be fair linux reassembly unit is not working under any serious load. It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!) A work queue is supposed to garbage collect items when host is under memory pressure, and doing a hash rebuild, changing seed used in hash computations. This work queue blocks softirqs for up to 25 ms when doing a hash rebuild, occurring every 5 seconds if host is under fire. Then there is the problem of sharing this hash table for all netns. It is time to switch to rhashtables, and allocate one of them per netns to speedup netns dismantle, since this is a critical metric these days. Lookup is now using RCU. A followup patch will even remove the refcount hold/release left from prior implementation and save a couple of atomic operations. Before this patch, 16 cpus (16 RX queue NIC) could not handle more than 1 Mpps frags DDOS. After the patch, I reach 9 Mpps without any tuning, and can use up to 2GB of storage for the fragments (exact number depends on frags being evicted after timeout) $ grep FRAG /proc/net/sockstat FRAG: inuse 1966916 memory 2140004608 A followup patch will change the limits for 64bit arches. Signed-off-by: Eric Dumazet Cc: Kirill Tkhai Cc: Herbert Xu Cc: Florian Westphal Cc: Jesper Dangaard Brouer Cc: Alexander Aring Cc: Stefan Schmidt Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +- include/net/inet_frag.h | 81 ++++---- include/net/ipv6.h | 16 +- net/ieee802154/6lowpan/6lowpan_i.h | 26 +-- net/ieee802154/6lowpan/reassembly.c | 91 ++++----- net/ipv4/inet_fragment.c | 344 ++++++-------------------------- net/ipv4/ip_fragment.c | 112 +++++------ net/ipv6/netfilter/nf_conntrack_reasm.c | 51 ++--- net/ipv6/reassembly.c | 110 +++++----- 9 files changed, 265 insertions(+), 573 deletions(-) (limited to 'net/ipv4') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 33f35f049ad5..6f2a3670e44b 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -134,13 +134,10 @@ min_adv_mss - INTEGER IP Fragmentation: ipfrag_high_thresh - INTEGER - Maximum memory used to reassemble IP fragments. When - ipfrag_high_thresh bytes of memory is allocated for this purpose, - the fragment handler will toss packets until ipfrag_low_thresh - is reached. This also serves as a maximum limit to namespaces - different from the initial one. + Maximum memory used to reassemble IP fragments. ipfrag_low_thresh - INTEGER + (Obsolete since linux-4.17) Maximum memory used to reassemble IP fragments before the kernel begins to remove incomplete fragment queues to free up resources. The kernel still accepts new fragments for defragmentation. diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 69e531ed8189..3fec0d3a0d01 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -2,7 +2,11 @@ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ +#include + struct netns_frags { + struct rhashtable rhashtable ____cacheline_aligned_in_smp; + /* Keep atomic mem on separate cachelines in structs that include it */ atomic_t mem ____cacheline_aligned_in_smp; /* sysctls */ @@ -26,12 +30,30 @@ enum { INET_FRAG_COMPLETE = BIT(2), }; +struct frag_v4_compare_key { + __be32 saddr; + __be32 daddr; + u32 user; + u32 vif; + __be16 id; + u16 protocol; +}; + +struct frag_v6_compare_key { + struct in6_addr saddr; + struct in6_addr daddr; + u32 user; + __be32 id; + u32 iif; +}; + /** * struct inet_frag_queue - fragment queue * - * @lock: spinlock protecting the queue + * @node: rhash node + * @key: keys identifying this frag. * @timer: queue expiration timer - * @list: hash bucket list + * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @fragments: received fragments head * @fragments_tail: received fragments tail @@ -41,12 +63,16 @@ enum { * @flags: fragment queue flags * @max_size: maximum received fragment size * @net: namespace that this frag belongs to - * @list_evictor: list of queues to forcefully evict (e.g. due to low memory) + * @rcu: rcu head for freeing deferall */ struct inet_frag_queue { - spinlock_t lock; + struct rhash_head node; + union { + struct frag_v4_compare_key v4; + struct frag_v6_compare_key v6; + } key; struct timer_list timer; - struct hlist_node list; + spinlock_t lock; refcount_t refcnt; struct sk_buff *fragments; struct sk_buff *fragments_tail; @@ -55,51 +81,20 @@ struct inet_frag_queue { int meat; __u8 flags; u16 max_size; - struct netns_frags *net; - struct hlist_node list_evictor; -}; - -#define INETFRAGS_HASHSZ 1024 - -/* averaged: - * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ / - * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or - * struct frag_queue)) - */ -#define INETFRAGS_MAXDEPTH 128 - -struct inet_frag_bucket { - struct hlist_head chain; - spinlock_t chain_lock; + struct netns_frags *net; + struct rcu_head rcu; }; struct inet_frags { - struct inet_frag_bucket hash[INETFRAGS_HASHSZ]; - - struct work_struct frags_work; - unsigned int next_bucket; - unsigned long last_rebuild_jiffies; - bool rebuild; - - /* The first call to hashfn is responsible to initialize - * rnd. This is best done with net_get_random_once. - * - * rnd_seqlock is used to let hash insertion detect - * when it needs to re-lookup the hash chain to use. - */ - u32 rnd; - seqlock_t rnd_seqlock; unsigned int qsize; - unsigned int (*hashfn)(const struct inet_frag_queue *); - bool (*match)(const struct inet_frag_queue *q, - const void *arg); void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); void (*frag_expire)(struct timer_list *t); struct kmem_cache *frags_cachep; const char *frags_cache_name; + struct rhashtable_params rhash_params; }; int inet_frags_init(struct inet_frags *); @@ -108,15 +103,13 @@ void inet_frags_fini(struct inet_frags *); static inline int inet_frags_init_net(struct netns_frags *nf) { atomic_set(&nf->mem, 0); - return 0; + return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params); } void inet_frags_exit_net(struct netns_frags *nf); void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, unsigned int hash); - +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, const char *prefix); @@ -128,7 +121,7 @@ static inline void inet_frag_put(struct inet_frag_queue *q) static inline bool inet_frag_evicting(struct inet_frag_queue *q) { - return !hlist_unhashed(&q->list_evictor); + return false; } /* Memory Tracking Functions. */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 57b7fe43d2ab..6fa9a2bc5896 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -579,17 +579,8 @@ enum ip6_defrag_users { __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, }; -struct ip6_create_arg { - __be32 id; - u32 user; - const struct in6_addr *src; - const struct in6_addr *dst; - int iif; - u8 ecn; -}; - void ip6_frag_init(struct inet_frag_queue *q, const void *a); -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); +extern const struct rhashtable_params ip6_rhash_params; /* * Equivalent of ipv4 struct ip @@ -597,11 +588,6 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); struct frag_queue { struct inet_frag_queue q; - __be32 id; /* fragment id */ - u32 user; - struct in6_addr saddr; - struct in6_addr daddr; - int iif; __u16 nhoffset; u8 ecn; diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h index d8de3bcfb103..b8d95cb71c25 100644 --- a/net/ieee802154/6lowpan/6lowpan_i.h +++ b/net/ieee802154/6lowpan/6lowpan_i.h @@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result; #define LOWPAN_DISPATCH_FRAG1 0xc0 #define LOWPAN_DISPATCH_FRAGN 0xe0 -struct lowpan_create_arg { +struct frag_lowpan_compare_key { u16 tag; u16 d_size; - const struct ieee802154_addr *src; - const struct ieee802154_addr *dst; + const struct ieee802154_addr src; + const struct ieee802154_addr dst; }; -/* Equivalent of ipv4 struct ip +/* Equivalent of ipv4 struct ipq */ struct lowpan_frag_queue { struct inet_frag_queue q; - - u16 tag; - u16 d_size; - struct ieee802154_addr saddr; - struct ieee802154_addr daddr; }; -static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a) -{ - switch (a->mode) { - case IEEE802154_ADDR_LONG: - return (((__force u64)a->extended_addr) >> 32) ^ - (((__force u64)a->extended_addr) & 0xffffffff); - case IEEE802154_ADDR_SHORT: - return (__force u32)(a->short_addr + (a->pan_id << 16)); - default: - return 0; - } -} - int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); void lowpan_net_frag_exit(void); int lowpan_net_frag_init(void); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index ddada12a044d..0fa0121f85d4 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, struct net_device *ldev); -static unsigned int lowpan_hash_frag(u16 tag, u16 d_size, - const struct ieee802154_addr *saddr, - const struct ieee802154_addr *daddr) -{ - net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd)); - return jhash_3words(ieee802154_addr_hash(saddr), - ieee802154_addr_hash(daddr), - (__force u32)(tag + (d_size << 16)), - lowpan_frags.rnd); -} - -static unsigned int lowpan_hashfn(const struct inet_frag_queue *q) -{ - const struct lowpan_frag_queue *fq; - - fq = container_of(q, struct lowpan_frag_queue, q); - return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr); -} - -static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct lowpan_frag_queue *fq; - const struct lowpan_create_arg *arg = a; - - fq = container_of(q, struct lowpan_frag_queue, q); - return fq->tag == arg->tag && fq->d_size == arg->d_size && - ieee802154_addr_equal(&fq->saddr, arg->src) && - ieee802154_addr_equal(&fq->daddr, arg->dst); -} - static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { - const struct lowpan_create_arg *arg = a; + const struct frag_lowpan_compare_key *key = a; struct lowpan_frag_queue *fq; fq = container_of(q, struct lowpan_frag_queue, q); - fq->tag = arg->tag; - fq->d_size = arg->d_size; - fq->saddr = *arg->src; - fq->daddr = *arg->dst; + BUILD_BUG_ON(sizeof(*key) > sizeof(q->key)); + memcpy(&q->key, key, sizeof(*key)); } static void lowpan_frag_expire(struct timer_list *t) @@ -105,21 +73,17 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, const struct ieee802154_addr *src, const struct ieee802154_addr *dst) { - struct inet_frag_queue *q; - struct lowpan_create_arg arg; - unsigned int hash; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); + struct frag_lowpan_compare_key key = { + .tag = cb->d_tag, + .d_size = cb->d_size, + .src = *src, + .dst = *dst, + }; + struct inet_frag_queue *q; - arg.tag = cb->d_tag; - arg.d_size = cb->d_size; - arg.src = src; - arg.dst = dst; - - hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst); - - q = inet_frag_find(&ieee802154_lowpan->frags, - &lowpan_frags, &arg, hash); + q = inet_frag_find(&ieee802154_lowpan->frags, &key); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; @@ -611,17 +575,46 @@ static struct pernet_operations lowpan_frags_ops = { .exit = lowpan_frags_exit_net, }; +static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); +} + +static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key, + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); +} + +static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_lowpan_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static const struct rhashtable_params lowpan_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = lowpan_key_hashfn, + .obj_hashfn = lowpan_obj_hashfn, + .obj_cmpfn = lowpan_obj_cmpfn, + .automatic_shrinking = true, +}; + int __init lowpan_net_frag_init(void) { int ret; - lowpan_frags.hashfn = lowpan_hashfn; lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.destructor = NULL; lowpan_frags.qsize = sizeof(struct frag_queue); - lowpan_frags.match = lowpan_frag_match; lowpan_frags.frag_expire = lowpan_frag_expire; lowpan_frags.frags_cache_name = lowpan_frags_cache_name; + lowpan_frags.rhash_params = lowpan_rhash_params; ret = inet_frags_init(&lowpan_frags); if (ret) goto out; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 1ac69f65d0de..ebb8f411e0db 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -25,12 +25,6 @@ #include #include -#define INETFRAGS_EVICT_BUCKETS 128 -#define INETFRAGS_EVICT_MAX 512 - -/* don't rebuild inetfrag table with new secret more often than this */ -#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ) - /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field @@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); -static unsigned int -inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) -{ - return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); -} - -static bool inet_frag_may_rebuild(struct inet_frags *f) -{ - return time_after(jiffies, - f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL); -} - -static void inet_frag_secret_rebuild(struct inet_frags *f) -{ - int i; - - write_seqlock_bh(&f->rnd_seqlock); - - if (!inet_frag_may_rebuild(f)) - goto out; - - get_random_bytes(&f->rnd, sizeof(u32)); - - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - struct hlist_node *n; - - hb = &f->hash[i]; - spin_lock(&hb->chain_lock); - - hlist_for_each_entry_safe(q, n, &hb->chain, list) { - unsigned int hval = inet_frag_hashfn(f, q); - - if (hval != i) { - struct inet_frag_bucket *hb_dest; - - hlist_del(&q->list); - - /* Relink to new hash chain. */ - hb_dest = &f->hash[hval]; - - /* This is the only place where we take - * another chain_lock while already holding - * one. As this will not run concurrently, - * we cannot deadlock on hb_dest lock below, if its - * already locked it will be released soon since - * other caller cannot be waiting for hb lock - * that we've taken above. - */ - spin_lock_nested(&hb_dest->chain_lock, - SINGLE_DEPTH_NESTING); - hlist_add_head(&q->list, &hb_dest->chain); - spin_unlock(&hb_dest->chain_lock); - } - } - spin_unlock(&hb->chain_lock); - } - - f->rebuild = false; - f->last_rebuild_jiffies = jiffies; -out: - write_sequnlock_bh(&f->rnd_seqlock); -} - -static bool inet_fragq_should_evict(const struct inet_frag_queue *q) -{ - if (!hlist_unhashed(&q->list_evictor)) - return false; - - return q->net->low_thresh == 0 || - frag_mem_limit(q->net) >= q->net->low_thresh; -} - -static unsigned int -inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) -{ - struct inet_frag_queue *fq; - struct hlist_node *n; - unsigned int evicted = 0; - HLIST_HEAD(expired); - - spin_lock(&hb->chain_lock); - - hlist_for_each_entry_safe(fq, n, &hb->chain, list) { - if (!inet_fragq_should_evict(fq)) - continue; - - if (!del_timer(&fq->timer)) - continue; - - hlist_add_head(&fq->list_evictor, &expired); - ++evicted; - } - - spin_unlock(&hb->chain_lock); - - hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire(&fq->timer); - - return evicted; -} - -static void inet_frag_worker(struct work_struct *work) -{ - unsigned int budget = INETFRAGS_EVICT_BUCKETS; - unsigned int i, evicted = 0; - struct inet_frags *f; - - f = container_of(work, struct inet_frags, frags_work); - - BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); - - local_bh_disable(); - - for (i = READ_ONCE(f->next_bucket); budget; --budget) { - evicted += inet_evict_bucket(f, &f->hash[i]); - i = (i + 1) & (INETFRAGS_HASHSZ - 1); - if (evicted > INETFRAGS_EVICT_MAX) - break; - } - - f->next_bucket = i; - - local_bh_enable(); - - if (f->rebuild && inet_frag_may_rebuild(f)) - inet_frag_secret_rebuild(f); -} - -static void inet_frag_schedule_worker(struct inet_frags *f) -{ - if (unlikely(!work_pending(&f->frags_work))) - schedule_work(&f->frags_work); -} - int inet_frags_init(struct inet_frags *f) { - int i; - - INIT_WORK(&f->frags_work, inet_frag_worker); - - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb = &f->hash[i]; - - spin_lock_init(&hb->chain_lock); - INIT_HLIST_HEAD(&hb->chain); - } - - seqlock_init(&f->rnd_seqlock); - f->last_rebuild_jiffies = 0; f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, NULL); if (!f->frags_cachep) @@ -214,66 +59,42 @@ EXPORT_SYMBOL(inet_frags_init); void inet_frags_fini(struct inet_frags *f) { - cancel_work_sync(&f->frags_work); + /* We must wait that all inet_frag_destroy_rcu() have completed. */ + rcu_barrier(); + kmem_cache_destroy(f->frags_cachep); + f->frags_cachep = NULL; } EXPORT_SYMBOL(inet_frags_fini); -void inet_frags_exit_net(struct netns_frags *nf) -{ - struct inet_frags *f =nf->f; - unsigned int seq; - int i; - - nf->low_thresh = 0; - -evict_again: - local_bh_disable(); - seq = read_seqbegin(&f->rnd_seqlock); - - for (i = 0; i < INETFRAGS_HASHSZ ; i++) - inet_evict_bucket(f, &f->hash[i]); - - local_bh_enable(); - cond_resched(); - - if (read_seqretry(&f->rnd_seqlock, seq) || - sum_frag_mem_limit(nf)) - goto evict_again; -} -EXPORT_SYMBOL(inet_frags_exit_net); - -static struct inet_frag_bucket * -get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f) -__acquires(hb->chain_lock) +static void inet_frags_free_cb(void *ptr, void *arg) { - struct inet_frag_bucket *hb; - unsigned int seq, hash; + struct inet_frag_queue *fq = ptr; - restart: - seq = read_seqbegin(&f->rnd_seqlock); - - hash = inet_frag_hashfn(f, fq); - hb = &f->hash[hash]; + /* If we can not cancel the timer, it means this frag_queue + * is already disappearing, we have nothing to do. + * Otherwise, we own a refcount until the end of this function. + */ + if (!del_timer(&fq->timer)) + return; - spin_lock(&hb->chain_lock); - if (read_seqretry(&f->rnd_seqlock, seq)) { - spin_unlock(&hb->chain_lock); - goto restart; + spin_lock_bh(&fq->lock); + if (!(fq->flags & INET_FRAG_COMPLETE)) { + fq->flags |= INET_FRAG_COMPLETE; + refcount_dec(&fq->refcnt); } + spin_unlock_bh(&fq->lock); - return hb; + inet_frag_put(fq); } -static inline void fq_unlink(struct inet_frag_queue *fq) +void inet_frags_exit_net(struct netns_frags *nf) { - struct inet_frag_bucket *hb; + nf->low_thresh = 0; /* prevent creation of new frags */ - hb = get_frag_bucket_locked(fq, fq->net->f); - hlist_del(&fq->list); - fq->flags |= INET_FRAG_COMPLETE; - spin_unlock(&hb->chain_lock); + rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); } +EXPORT_SYMBOL(inet_frags_exit_net); void inet_frag_kill(struct inet_frag_queue *fq) { @@ -281,12 +102,26 @@ void inet_frag_kill(struct inet_frag_queue *fq) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { - fq_unlink(fq); + struct netns_frags *nf = fq->net; + + fq->flags |= INET_FRAG_COMPLETE; + rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params); refcount_dec(&fq->refcnt); } } EXPORT_SYMBOL(inet_frag_kill); +static void inet_frag_destroy_rcu(struct rcu_head *head) +{ + struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, + rcu); + struct inet_frags *f = q->net->f; + + if (f->destructor) + f->destructor(q); + kmem_cache_free(f->frags_cachep, q); +} + void inet_frag_destroy(struct inet_frag_queue *q) { struct sk_buff *fp; @@ -310,59 +145,20 @@ void inet_frag_destroy(struct inet_frag_queue *q) } sum = sum_truesize + f->qsize; - if (f->destructor) - f->destructor(q); - kmem_cache_free(f->frags_cachep, q); + call_rcu(&q->rcu, inet_frag_destroy_rcu); sub_frag_mem_limit(nf, sum); } EXPORT_SYMBOL(inet_frag_destroy); -static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, - struct inet_frag_queue *qp_in, - struct inet_frags *f, - void *arg) -{ - struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); - struct inet_frag_queue *qp; - -#ifdef CONFIG_SMP - /* With SMP race we have to recheck hash table, because - * such entry could have been created on other cpu before - * we acquired hash bucket lock. - */ - hlist_for_each_entry(qp, &hb->chain, list) { - if (qp->net == nf && f->match(qp, arg)) { - refcount_inc(&qp->refcnt); - spin_unlock(&hb->chain_lock); - qp_in->flags |= INET_FRAG_COMPLETE; - inet_frag_put(qp_in); - return qp; - } - } -#endif - qp = qp_in; - if (!mod_timer(&qp->timer, jiffies + nf->timeout)) - refcount_inc(&qp->refcnt); - - refcount_inc(&qp->refcnt); - hlist_add_head(&qp->list, &hb->chain); - - spin_unlock(&hb->chain_lock); - - return qp; -} - static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, struct inet_frags *f, void *arg) { struct inet_frag_queue *q; - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { - inet_frag_schedule_worker(f); + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) return NULL; - } q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (!q) @@ -374,59 +170,52 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); - refcount_set(&q->refcnt, 1); + refcount_set(&q->refcnt, 3); return q; } static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - struct inet_frags *f, void *arg) { + struct inet_frags *f = nf->f; struct inet_frag_queue *q; + int err; q = inet_frag_alloc(nf, f, arg); if (!q) return NULL; - return inet_frag_intern(nf, q, f, arg); + mod_timer(&q->timer, jiffies + nf->timeout); + + err = rhashtable_insert_fast(&nf->rhashtable, &q->node, + f->rhash_params); + if (err < 0) { + q->flags |= INET_FRAG_COMPLETE; + inet_frag_kill(q); + inet_frag_destroy(q); + return NULL; + } + return q; } -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, - unsigned int hash) +/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - int depth = 0; - - if (frag_mem_limit(nf) > nf->low_thresh) - inet_frag_schedule_worker(f); - - hash &= (INETFRAGS_HASHSZ - 1); - hb = &f->hash[hash]; - - spin_lock(&hb->chain_lock); - hlist_for_each_entry(q, &hb->chain, list) { - if (q->net == nf && f->match(q, key)) { - refcount_inc(&q->refcnt); - spin_unlock(&hb->chain_lock); - return q; - } - depth++; - } - spin_unlock(&hb->chain_lock); + struct inet_frag_queue *fq; - if (depth <= INETFRAGS_MAXDEPTH) - return inet_frag_create(nf, f, key); + rcu_read_lock(); - if (inet_frag_may_rebuild(f)) { - if (!f->rebuild) - f->rebuild = true; - inet_frag_schedule_worker(f); + fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + if (fq) { + if (!refcount_inc_not_zero(&fq->refcnt)) + fq = NULL; + rcu_read_unlock(); + return fq; } + rcu_read_unlock(); - return ERR_PTR(-ENOBUFS); + return inet_frag_create(nf, key); } EXPORT_SYMBOL(inet_frag_find); @@ -434,8 +223,7 @@ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, const char *prefix) { static const char msg[] = "inet_frag_find: Fragment hash bucket" - " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) - ". Dropping fragment.\n"; + " list length grew over limit. Dropping fragment.\n"; if (PTR_ERR(q) == -ENOBUFS) net_dbg_ratelimited("%s%s", prefix, msg); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 1a3bc85d6f5e..4021820db6f2 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -69,15 +69,9 @@ struct ipfrag_skb_cb struct ipq { struct inet_frag_queue q; - u32 user; - __be32 saddr; - __be32 daddr; - __be16 id; - u8 protocol; u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; - int vif; /* L3 master device index */ unsigned int rid; struct inet_peer *peer; }; @@ -97,41 +91,6 @@ int ip_frag_mem(struct net *net) static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev); -struct ip4_create_arg { - struct iphdr *iph; - u32 user; - int vif; -}; - -static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) -{ - net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); - return jhash_3words((__force u32)id << 16 | prot, - (__force u32)saddr, (__force u32)daddr, - ip4_frags.rnd); -} - -static unsigned int ip4_hashfn(const struct inet_frag_queue *q) -{ - const struct ipq *ipq; - - ipq = container_of(q, struct ipq, q); - return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); -} - -static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct ipq *qp; - const struct ip4_create_arg *arg = a; - - qp = container_of(q, struct ipq, q); - return qp->id == arg->iph->id && - qp->saddr == arg->iph->saddr && - qp->daddr == arg->iph->daddr && - qp->protocol == arg->iph->protocol && - qp->user == arg->user && - qp->vif == arg->vif; -} static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { @@ -140,17 +99,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) frags); struct net *net = container_of(ipv4, struct net, ipv4); - const struct ip4_create_arg *arg = a; + const struct frag_v4_compare_key *key = a; - qp->protocol = arg->iph->protocol; - qp->id = arg->iph->id; - qp->ecn = ip4_frag_ecn(arg->iph->tos); - qp->saddr = arg->iph->saddr; - qp->daddr = arg->iph->daddr; - qp->vif = arg->vif; - qp->user = arg->user; + q->key.v4 = *key; + qp->ecn = 0; qp->peer = q->net->max_dist ? - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : + inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : NULL; } @@ -234,7 +188,7 @@ static void ip_expire(struct timer_list *t) /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ - if (frag_expire_skip_icmp(qp->user) && + if (frag_expire_skip_icmp(qp->q.key.v4.user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; @@ -262,17 +216,17 @@ out_rcu_unlock: static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user, int vif) { + struct frag_v4_compare_key key = { + .saddr = iph->saddr, + .daddr = iph->daddr, + .user = user, + .vif = vif, + .id = iph->id, + .protocol = iph->protocol, + }; struct inet_frag_queue *q; - struct ip4_create_arg arg; - unsigned int hash; - - arg.iph = iph; - arg.user = user; - arg.vif = vif; - hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); - - q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); + q = inet_frag_find(&net->ipv4.frags, &key); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; @@ -656,7 +610,7 @@ out_nomem: err = -ENOMEM; goto out_fail; out_oversize: - net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); + net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr); out_fail: __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; @@ -894,15 +848,47 @@ static struct pernet_operations ip4_frags_ops = { .exit = ipv4_frags_exit_net, }; + +static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); +} + +static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v4, + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); +} + +static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v4_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static const struct rhashtable_params ip4_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .key_offset = offsetof(struct inet_frag_queue, key), + .key_len = sizeof(struct frag_v4_compare_key), + .hashfn = ip4_key_hashfn, + .obj_hashfn = ip4_obj_hashfn, + .obj_cmpfn = ip4_obj_cmpfn, + .automatic_shrinking = true, +}; + void __init ipfrag_init(void) { - ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; ip4_frags.qsize = sizeof(struct ipq); - ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; ip4_frags.frags_cache_name = ip_frag_cache_name; + ip4_frags.rhash_params = ip4_rhash_params; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); ip4_frags_ctl_register(); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index c4b40fdee838..0ad3df551d98 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -152,23 +152,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } -static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, nf_frags.rnd); -} - - -static unsigned int nf_hashfn(const struct inet_frag_queue *q) -{ - const struct frag_queue *nq; - - nq = container_of(q, struct frag_queue, q); - return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); -} - static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); @@ -182,26 +165,19 @@ static void nf_ct_frag6_expire(struct timer_list *t) } /* Creation primitives. */ -static inline struct frag_queue *fq_find(struct net *net, __be32 id, - u32 user, struct in6_addr *src, - struct in6_addr *dst, int iif, u8 ecn) +static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, + const struct ipv6hdr *hdr, int iif) { + struct frag_v6_compare_key key = { + .id = id, + .saddr = hdr->saddr, + .daddr = hdr->daddr, + .user = user, + .iif = iif, + }; struct inet_frag_queue *q; - struct ip6_create_arg arg; - unsigned int hash; - - arg.id = id; - arg.user = user; - arg.src = src; - arg.dst = dst; - arg.iif = iif; - arg.ecn = ecn; - - local_bh_disable(); - hash = nf_hash_frag(id, src, dst); - q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); - local_bh_enable(); + q = inet_frag_find(&net->nf_frag.frags, &key); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; @@ -593,8 +569,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) fhdr = (struct frag_hdr *)skb_transport_header(skb); skb_orphan(skb); - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); + fq = fq_find(net, fhdr->identification, user, hdr, + skb->dev ? skb->dev->ifindex : 0); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); return -ENOMEM; @@ -660,13 +636,12 @@ int nf_ct_frag6_init(void) { int ret = 0; - nf_frags.hashfn = nf_hashfn; nf_frags.constructor = ip6_frag_init; nf_frags.destructor = NULL; nf_frags.qsize = sizeof(struct frag_queue); - nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frags_cache_name = nf_frags_cache_name; + nf_frags.rhash_params = ip6_rhash_params; ret = inet_frags_init(&nf_frags); if (ret) goto out; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index f0071b113a92..3fc853e4492a 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -79,52 +79,13 @@ static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev); -/* - * callers should be careful not to use the hash value outside the ipfrag_lock - * as doing so could race with ipfrag_hash_rnd being recalculated. - */ -static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd)); - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, ip6_frags.rnd); -} - -static unsigned int ip6_hashfn(const struct inet_frag_queue *q) -{ - const struct frag_queue *fq; - - fq = container_of(q, struct frag_queue, q); - return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr); -} - -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct frag_queue *fq; - const struct ip6_create_arg *arg = a; - - fq = container_of(q, struct frag_queue, q); - return fq->id == arg->id && - fq->user == arg->user && - ipv6_addr_equal(&fq->saddr, arg->src) && - ipv6_addr_equal(&fq->daddr, arg->dst) && - (arg->iif == fq->iif || - !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST | - IPV6_ADDR_LINKLOCAL))); -} -EXPORT_SYMBOL(ip6_frag_match); - void ip6_frag_init(struct inet_frag_queue *q, const void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); - const struct ip6_create_arg *arg = a; + const struct frag_v6_compare_key *key = a; - fq->id = arg->id; - fq->user = arg->user; - fq->saddr = *arg->src; - fq->daddr = *arg->dst; - fq->ecn = arg->ecn; + q->key.v6 = *key; + fq->ecn = 0; } EXPORT_SYMBOL(ip6_frag_init); @@ -182,23 +143,22 @@ static void ip6_frag_expire(struct timer_list *t) } static struct frag_queue * -fq_find(struct net *net, __be32 id, const struct in6_addr *src, - const struct in6_addr *dst, int iif, u8 ecn) +fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) { + struct frag_v6_compare_key key = { + .id = id, + .saddr = hdr->saddr, + .daddr = hdr->daddr, + .user = IP6_DEFRAG_LOCAL_DELIVER, + .iif = iif, + }; struct inet_frag_queue *q; - struct ip6_create_arg arg; - unsigned int hash; - - arg.id = id; - arg.user = IP6_DEFRAG_LOCAL_DELIVER; - arg.src = src; - arg.dst = dst; - arg.iif = iif; - arg.ecn = ecn; - hash = inet6_hash_frag(id, src, dst); + if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | + IPV6_ADDR_LINKLOCAL))) + key.iif = 0; - q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); + q = inet_frag_find(&net->ipv6.frags, &key); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; @@ -530,6 +490,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); + int iif; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; @@ -558,13 +519,14 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); + iif = skb->dev ? skb->dev->ifindex : 0; + fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { int ret; spin_lock(&fq->q.lock); + fq->iif = iif; ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); spin_unlock(&fq->q.lock); @@ -738,17 +700,47 @@ static struct pernet_operations ip6_frags_ops = { .exit = ipv6_frags_exit_net, }; +static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v6, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v6_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +const struct rhashtable_params ip6_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = ip6_key_hashfn, + .obj_hashfn = ip6_obj_hashfn, + .obj_cmpfn = ip6_obj_cmpfn, + .automatic_shrinking = true, +}; +EXPORT_SYMBOL(ip6_rhash_params); + int __init ipv6_frag_init(void) { int ret; - ip6_frags.hashfn = ip6_hashfn; ip6_frags.constructor = ip6_frag_init; ip6_frags.destructor = NULL; ip6_frags.qsize = sizeof(struct frag_queue); - ip6_frags.match = ip6_frag_match; ip6_frags.frag_expire = ip6_frag_expire; ip6_frags.frags_cache_name = ip6_frag_cache_name; + ip6_frags.rhash_params = ip6_rhash_params; ret = inet_frags_init(&ip6_frags); if (ret) goto out; -- cgit v1.2.3-59-g8ed1b From 6befe4a78b1553edb6eed3a78b4bcd9748526672 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:50 -0700 Subject: inet: frags: remove some helpers Remove sum_frag_mem_limit(), ip_frag_mem() & ip6_frag_mem() Also since we use rhashtable we can bring back the number of fragments in "grep FRAG /proc/net/sockstat /proc/net/sockstat6" that was removed in commit 434d305405ab ("inet: frag: don't account number of fragment queues") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_frag.h | 5 ----- include/net/ip.h | 1 - include/net/ipv6.h | 7 ------- net/ipv4/ip_fragment.c | 5 ----- net/ipv4/proc.c | 6 +++--- net/ipv6/proc.c | 5 +++-- 6 files changed, 6 insertions(+), 23 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 3fec0d3a0d01..4b5449df0aad 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -141,11 +141,6 @@ static inline void add_frag_mem_limit(struct netns_frags *nf, int i) atomic_add(i, &nf->mem); } -static inline int sum_frag_mem_limit(struct netns_frags *nf) -{ - return atomic_read(&nf->mem); -} - /* RFC 3168 support : * We want to check ECN values of all fragments, do detect invalid combinations. * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. diff --git a/include/net/ip.h b/include/net/ip.h index 36f8f7811093..ecffd843e7b8 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -588,7 +588,6 @@ static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *s return skb; } #endif -int ip_frag_mem(struct net *net); /* * Functions provided by ip_forward.c diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6fa9a2bc5896..37455e840347 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -379,13 +379,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev) idev->cnf.accept_ra; } -#if IS_ENABLED(CONFIG_IPV6) -static inline int ip6_frag_mem(struct net *net) -{ - return sum_frag_mem_limit(&net->ipv6.frags); -} -#endif - #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ #define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */ #define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */ diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 4021820db6f2..44f4fa306e22 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -83,11 +83,6 @@ static u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; -int ip_frag_mem(struct net *net) -{ - return sum_frag_mem_limit(&net->ipv4.frags); -} - static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index adfb75340275..aacfce0d7d82 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -54,7 +54,6 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; - unsigned int frag_mem; int orphans, sockets; orphans = percpu_counter_sum_positive(&tcp_orphan_count); @@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); - frag_mem = ip_frag_mem(net); - seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem); + seq_printf(seq, "FRAG: inuse %u memory %u\n", + atomic_read(&net->ipv4.frags.rhashtable.nelems), + frag_mem_limit(&net->ipv4.frags)); return 0; } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 6e57028d2e91..8befeb91e071 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -38,7 +38,6 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; - unsigned int frag_mem = ip6_frag_mem(net); seq_printf(seq, "TCP6: inuse %d\n", sock_prot_inuse_get(net, &tcpv6_prot)); @@ -48,7 +47,9 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); - seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem); + seq_printf(seq, "FRAG6: inuse %u memory %u\n", + atomic_read(&net->ipv6.frags.rhashtable.nelems), + frag_mem_limit(&net->ipv6.frags)); return 0; } -- cgit v1.2.3-59-g8ed1b From 399d1404be660d355192ff4df5ccc3f4159ec1e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:51 -0700 Subject: inet: frags: get rif of inet_frag_evicting() This refactors ip_expire() since one indentation level is removed. Note: in the future, we should try hard to avoid the skb_clone() since this is a serious performance cost. Under DDOS, the ICMP message wont be sent because of rate limits. Fact that ip6_expire_frag_queue() does not use skb_clone() is disturbing too. Presumably IPv6 should have the same issue than the one we fixed in commit ec4fbd64751d ("inet: frag: release spinlock before calling icmp_send()") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_frag.h | 5 ---- net/ipv4/ip_fragment.c | 65 ++++++++++++++++++++++++------------------------- net/ipv6/reassembly.c | 4 --- 3 files changed, 32 insertions(+), 42 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 4b5449df0aad..0e8e159d88f7 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -119,11 +119,6 @@ static inline void inet_frag_put(struct inet_frag_queue *q) inet_frag_destroy(q); } -static inline bool inet_frag_evicting(struct inet_frag_queue *q) -{ - return false; -} - /* Memory Tracking Functions. */ static inline int frag_mem_limit(struct netns_frags *nf) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 44f4fa306e22..b844f517b75b 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -143,8 +143,11 @@ static bool frag_expire_skip_icmp(u32 user) static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); - struct ipq *qp; + struct sk_buff *clone, *head; + const struct iphdr *iph; struct net *net; + struct ipq *qp; + int err; qp = container_of(frag, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); @@ -158,45 +161,41 @@ static void ip_expire(struct timer_list *t) ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - if (!inet_frag_evicting(&qp->q)) { - struct sk_buff *clone, *head = qp->q.fragments; - const struct iphdr *iph; - int err; + head = qp->q.fragments; - __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); + __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); - if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) - goto out; + if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) + goto out; - head->dev = dev_get_by_index_rcu(net, qp->iif); - if (!head->dev) - goto out; + head->dev = dev_get_by_index_rcu(net, qp->iif); + if (!head->dev) + goto out; - /* skb has no dst, perform route lookup again */ - iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, + /* skb has no dst, perform route lookup again */ + iph = ip_hdr(head); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, iph->tos, head->dev); - if (err) - goto out; + if (err) + goto out; - /* Only an end host needs to send an ICMP - * "Fragment Reassembly Timeout" message, per RFC792. - */ - if (frag_expire_skip_icmp(qp->q.key.v4.user) && - (skb_rtable(head)->rt_type != RTN_LOCAL)) - goto out; - - clone = skb_clone(head, GFP_ATOMIC); - - /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if (clone) { - spin_unlock(&qp->q.lock); - icmp_send(clone, ICMP_TIME_EXCEEDED, - ICMP_EXC_FRAGTIME, 0); - consume_skb(clone); - goto out_rcu_unlock; - } + /* Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. + */ + if (frag_expire_skip_icmp(qp->q.key.v4.user) && + (skb_rtable(head)->rt_type != RTN_LOCAL)) + goto out; + + clone = skb_clone(head, GFP_ATOMIC); + + /* Send an ICMP "Fragment Reassembly Timeout" message. */ + if (clone) { + spin_unlock(&qp->q.lock); + icmp_send(clone, ICMP_TIME_EXCEEDED, + ICMP_EXC_FRAGTIME, 0); + consume_skb(clone); + goto out_rcu_unlock; } out: spin_unlock(&qp->q.lock); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 3fc853e4492a..70acad126d04 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -106,10 +106,6 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) goto out_rcu_unlock; __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); - - if (inet_frag_evicting(&fq->q)) - goto out_rcu_unlock; - __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); /* Don't send error if the first segment did not arrive. */ -- cgit v1.2.3-59-g8ed1b From 2d44ed22e607f9a285b049de2263e3840673a260 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:52 -0700 Subject: inet: frags: remove inet_frag_maybe_warn_overflow() This function is obsolete, after rhashtable addition to inet defrag. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_frag.h | 2 -- net/ieee802154/6lowpan/reassembly.c | 5 ++--- net/ipv4/inet_fragment.c | 11 ----------- net/ipv4/ip_fragment.c | 5 ++--- net/ipv6/netfilter/nf_conntrack_reasm.c | 5 ++--- net/ipv6/reassembly.c | 5 ++--- 6 files changed, 8 insertions(+), 25 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 0e8e159d88f7..95e353e3305b 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -110,8 +110,6 @@ void inet_frags_exit_net(struct netns_frags *nf); void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, - const char *prefix); static inline void inet_frag_put(struct inet_frag_queue *q) { diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 0fa0121f85d4..1aec71a3f904 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -84,10 +84,9 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, struct inet_frag_queue *q; q = inet_frag_find(&ieee802154_lowpan->frags, &key); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + if (!q) return NULL; - } + return container_of(q, struct lowpan_frag_queue, q); } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index ebb8f411e0db..c9e35b81d093 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -218,14 +218,3 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) return inet_frag_create(nf, key); } EXPORT_SYMBOL(inet_frag_find); - -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, - const char *prefix) -{ - static const char msg[] = "inet_frag_find: Fragment hash bucket" - " list length grew over limit. Dropping fragment.\n"; - - if (PTR_ERR(q) == -ENOBUFS) - net_dbg_ratelimited("%s%s", prefix, msg); -} -EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b844f517b75b..b0366224f314 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -221,10 +221,9 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, struct inet_frag_queue *q; q = inet_frag_find(&net->ipv4.frags, &key); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + if (!q) return NULL; - } + return container_of(q, struct ipq, q); } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 0ad3df551d98..d866412b8f6c 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -178,10 +178,9 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, struct inet_frag_queue *q; q = inet_frag_find(&net->nf_frag.frags, &key); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + if (!q) return NULL; - } + return container_of(q, struct frag_queue, q); } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 70acad126d04..2a77fda5e3bc 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -155,10 +155,9 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) key.iif = 0; q = inet_frag_find(&net->ipv6.frags, &key); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + if (!q) return NULL; - } + return container_of(q, struct frag_queue, q); } -- cgit v1.2.3-59-g8ed1b From 3e67f106f619dcfaf6f4e2039599bdb69848c714 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:53 -0700 Subject: inet: frags: break the 2GB limit for frags storage Some users are willing to provision huge amounts of memory to be able to perform reassembly reasonnably well under pressure. Current memory tracking is using one atomic_t and integers. Switch to atomic_long_t so that 64bit arches can use more than 2GB, without any cost for 32bit arches. Note that this patch avoids an overflow error, if high_thresh was set to ~2GB, since this test in inet_frag_alloc() was never true : if (... || frag_mem_limit(nf) > nf->high_thresh) Tested: $ echo 16000000000 >/proc/sys/net/ipv4/ipfrag_high_thresh $ grep FRAG /proc/net/sockstat FRAG: inuse 14705885 memory 16000002880 $ nstat -n ; sleep 1 ; nstat | grep Reas IpReasmReqds 3317150 0.0 IpReasmFails 3317112 0.0 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 4 ++-- include/net/inet_frag.h | 20 ++++++++++---------- net/ieee802154/6lowpan/reassembly.c | 10 +++++----- net/ipv4/ip_fragment.c | 10 +++++----- net/ipv4/proc.c | 2 +- net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +++++----- net/ipv6/proc.c | 2 +- net/ipv6/reassembly.c | 6 +++--- 8 files changed, 32 insertions(+), 32 deletions(-) (limited to 'net/ipv4') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 6f2a3670e44b..5dc1a040a2f1 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -133,10 +133,10 @@ min_adv_mss - INTEGER IP Fragmentation: -ipfrag_high_thresh - INTEGER +ipfrag_high_thresh - LONG INTEGER Maximum memory used to reassemble IP fragments. -ipfrag_low_thresh - INTEGER +ipfrag_low_thresh - LONG INTEGER (Obsolete since linux-4.17) Maximum memory used to reassemble IP fragments before the kernel begins to remove incomplete fragment queues to free up resources. diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 95e353e3305b..a52e7273e7a5 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -8,11 +8,11 @@ struct netns_frags { struct rhashtable rhashtable ____cacheline_aligned_in_smp; /* Keep atomic mem on separate cachelines in structs that include it */ - atomic_t mem ____cacheline_aligned_in_smp; + atomic_long_t mem ____cacheline_aligned_in_smp; /* sysctls */ + long high_thresh; + long low_thresh; int timeout; - int high_thresh; - int low_thresh; int max_dist; struct inet_frags *f; }; @@ -102,7 +102,7 @@ void inet_frags_fini(struct inet_frags *); static inline int inet_frags_init_net(struct netns_frags *nf) { - atomic_set(&nf->mem, 0); + atomic_long_set(&nf->mem, 0); return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params); } void inet_frags_exit_net(struct netns_frags *nf); @@ -119,19 +119,19 @@ static inline void inet_frag_put(struct inet_frag_queue *q) /* Memory Tracking Functions. */ -static inline int frag_mem_limit(struct netns_frags *nf) +static inline long frag_mem_limit(const struct netns_frags *nf) { - return atomic_read(&nf->mem); + return atomic_long_read(&nf->mem); } -static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) +static inline void sub_frag_mem_limit(struct netns_frags *nf, long val) { - atomic_sub(i, &nf->mem); + atomic_long_sub(val, &nf->mem); } -static inline void add_frag_mem_limit(struct netns_frags *nf, int i) +static inline void add_frag_mem_limit(struct netns_frags *nf, long val) { - atomic_add(i, &nf->mem); + atomic_long_add(val, &nf->mem); } /* RFC 3168 support : diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 1aec71a3f904..44f148a6bb57 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -411,23 +411,23 @@ err: } #ifdef CONFIG_SYSCTL -static int zero; +static long zero; static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", .data = &init_net.ieee802154_lowpan.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh }, { .procname = "6lowpanfrag_low_thresh", .data = &init_net.ieee802154_lowpan.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh }, diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b0366224f314..053869f2c49b 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -678,23 +678,23 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) EXPORT_SYMBOL(ip_check_defrag); #ifdef CONFIG_SYSCTL -static int zero; +static long zero; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", .data = &init_net.ipv4.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", .data = &init_net.ipv4.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.ipv4.frags.high_thresh }, diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index aacfce0d7d82..a058de677e94 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -71,7 +71,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); - seq_printf(seq, "FRAG: inuse %u memory %u\n", + seq_printf(seq, "FRAG: inuse %u memory %lu\n", atomic_read(&net->ipv4.frags.rhashtable.nelems), frag_mem_limit(&net->ipv4.frags)); return 0; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d866412b8f6c..603a39592859 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -63,7 +63,7 @@ struct nf_ct_frag6_skb_cb static struct inet_frags nf_frags; #ifdef CONFIG_SYSCTL -static int zero; +static long zero; static struct ctl_table nf_ct_frag6_sysctl_table[] = { { @@ -76,18 +76,18 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_low_thresh", .data = &init_net.nf_frag.frags.low_thresh, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", .data = &init_net.nf_frag.frags.high_thresh, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.nf_frag.frags.low_thresh }, { } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 8befeb91e071..a85f7e0b14b1 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -47,7 +47,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); - seq_printf(seq, "FRAG6: inuse %u memory %u\n", + seq_printf(seq, "FRAG6: inuse %u memory %lu\n", atomic_read(&net->ipv6.frags.rhashtable.nelems), frag_mem_limit(&net->ipv6.frags)); return 0; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 2a77fda5e3bc..905a8aee2671 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -552,15 +552,15 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", .data = &init_net.ipv6.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", .data = &init_net.ipv6.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, -- cgit v1.2.3-59-g8ed1b From 1eec5d5670084ee644597bd26c25e22c69b9f748 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:54 -0700 Subject: inet: frags: do not clone skb in ip_expire() An skb_clone() was added in commit ec4fbd64751d ("inet: frag: release spinlock before calling icmp_send()") While fixing the bug at that time, it also added a very high cost for DDOS frags, as the ICMP rate limit is applied after this expensive operation (skb_clone() + consume_skb(), implying memory allocations, copy, and freeing) We can use skb_get(head) here, all we want is to make sure skb wont be freed by another cpu. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 053869f2c49b..fb185d9a5cc7 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -143,8 +143,8 @@ static bool frag_expire_skip_icmp(u32 user) static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); - struct sk_buff *clone, *head; const struct iphdr *iph; + struct sk_buff *head; struct net *net; struct ipq *qp; int err; @@ -187,16 +187,12 @@ static void ip_expire(struct timer_list *t) (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; - clone = skb_clone(head, GFP_ATOMIC); + skb_get(head); + spin_unlock(&qp->q.lock); + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); + kfree_skb(head); + goto out_rcu_unlock; - /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if (clone) { - spin_unlock(&qp->q.lock); - icmp_send(clone, ICMP_TIME_EXCEEDED, - ICMP_EXC_FRAGTIME, 0); - consume_skb(clone); - goto out_rcu_unlock; - } out: spin_unlock(&qp->q.lock); out_rcu_unlock: -- cgit v1.2.3-59-g8ed1b From bf66337140c64c27fa37222b7abca7e49d63fb57 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:58 -0700 Subject: inet: frags: get rid of ipfrag_skb_cb/FRAG_CB ip_defrag uses skb->cb[] to store the fragment offset, and unfortunately this integer is currently in a different cache line than skb->next, meaning that we use two cache lines per skb when finding the insertion point. By aliasing skb->ip_defrag_offset and skb->dev, we pack all the fields in a single cache line and save precious memory bandwidth. Note that after the fast path added by Changli Gao in commit d6bebca92c66 ("fragment: add fast path for in-order fragments") this change wont help the fast path, since we still need to access prev->len (2nd cache line), but will show great benefits when slow path is entered, since we perform a linear scan of a potentially long list. Also, note that this potential long list is an attack vector, we might consider also using an rb-tree there eventually. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + net/ipv4/ip_fragment.c | 35 ++++++++++++++--------------------- 2 files changed, 15 insertions(+), 21 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 47082f54ec1f..9065477ed255 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -672,6 +672,7 @@ struct sk_buff { * UDP receive path is one user. */ unsigned long dev_scratch; + int ip_defrag_offset; }; }; struct rb_node rbnode; /* used in netem & tcp stack */ diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fb185d9a5cc7..994fa70a910f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,14 +57,6 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; -struct ipfrag_skb_cb -{ - struct inet_skb_parm h; - int offset; -}; - -#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) - /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; @@ -353,13 +345,13 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * this fragment, right? */ prev = qp->q.fragments_tail; - if (!prev || FRAG_CB(prev)->offset < offset) { + if (!prev || prev->ip_defrag_offset < offset) { next = NULL; goto found; } prev = NULL; for (next = qp->q.fragments; next != NULL; next = next->next) { - if (FRAG_CB(next)->offset >= offset) + if (next->ip_defrag_offset >= offset) break; /* bingo! */ prev = next; } @@ -370,7 +362,7 @@ found: * any overlaps are eliminated. */ if (prev) { - int i = (FRAG_CB(prev)->offset + prev->len) - offset; + int i = (prev->ip_defrag_offset + prev->len) - offset; if (i > 0) { offset += i; @@ -387,8 +379,8 @@ found: err = -ENOMEM; - while (next && FRAG_CB(next)->offset < end) { - int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */ + while (next && next->ip_defrag_offset < end) { + int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */ if (i < next->len) { /* Eat head of the next overlapped fragment @@ -396,7 +388,7 @@ found: */ if (!pskb_pull(next, i)) goto err; - FRAG_CB(next)->offset += i; + next->ip_defrag_offset += i; qp->q.meat -= i; if (next->ip_summed != CHECKSUM_UNNECESSARY) next->ip_summed = CHECKSUM_NONE; @@ -420,7 +412,13 @@ found: } } - FRAG_CB(skb)->offset = offset; + /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + dev = skb->dev; + if (dev) + qp->iif = dev->ifindex; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + skb->ip_defrag_offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; @@ -431,11 +429,6 @@ found: else qp->q.fragments = skb; - dev = skb->dev; - if (dev) { - qp->iif = dev->ifindex; - skb->dev = NULL; - } qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; @@ -511,7 +504,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } WARN_ON(!head); - WARN_ON(FRAG_CB(head)->offset != 0); + WARN_ON(head->ip_defrag_offset != 0); /* Allocate a new buffer for the datagram. */ ihlen = ip_hdrlen(head); -- cgit v1.2.3-59-g8ed1b