diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/dev.c | 108 | ||||
-rw-r--r-- | net/core/drop_monitor.c | 29 | ||||
-rw-r--r-- | net/core/flow_dissector.c | 49 | ||||
-rw-r--r-- | net/core/gro.c | 3 | ||||
-rw-r--r-- | net/core/net_namespace.c | 8 | ||||
-rw-r--r-- | net/core/skbuff.c | 110 | ||||
-rw-r--r-- | net/core/sock_map.c | 3 |
7 files changed, 157 insertions, 153 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index ebc000b56828..18064be6cf3e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1122,6 +1122,12 @@ out: return ret; } +static bool dev_addr_cmp(struct net_device *dev, unsigned short type, + const char *ha) +{ + return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len); +} + /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace @@ -1130,7 +1136,7 @@ out: * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. - * The caller must hold RCU or RTNL. + * The caller must hold RCU. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * @@ -1142,14 +1148,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, struct net_device *dev; for_each_netdev_rcu(net, dev) - if (dev->type == type && - !memcmp(dev->dev_addr, ha, dev->addr_len)) + if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); +/** + * dev_getbyhwaddr() - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold + * rtnl_lock. + * + * Context: rtnl_lock() must be held. + * Return: pointer to the net_device, or NULL if not found + */ +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *ha) +{ + struct net_device *dev; + + ASSERT_RTNL(); + for_each_netdev(net, dev) + if (dev_addr_cmp(dev, type, ha)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_getbyhwaddr); + struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; @@ -2071,6 +2102,42 @@ static void __move_netdevice_notifier_net(struct net *src_net, __register_netdevice_notifier_net(dst_net, nb, true); } +static void rtnl_net_dev_lock(struct net_device *dev) +{ + bool again; + + do { + struct net *net; + + again = false; + + /* netns might be being dismantled. */ + rcu_read_lock(); + net = dev_net_rcu(dev); + net_passive_inc(net); + rcu_read_unlock(); + + rtnl_net_lock(net); + +#ifdef CONFIG_NET_NS + /* dev might have been moved to another netns. */ + if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) { + rtnl_net_unlock(net); + net_passive_dec(net); + again = true; + } +#endif + } while (again); +} + +static void rtnl_net_dev_unlock(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + rtnl_net_unlock(net); + net_passive_dec(net); +} + int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) @@ -2078,6 +2145,11 @@ int register_netdevice_notifier_dev_net(struct net_device *dev, struct net *net = dev_net(dev); int err; + /* rtnl_net_lock() assumes dev is not yet published by + * register_netdevice(). + */ + DEBUG_NET_WARN_ON_ONCE(!list_empty(&dev->dev_list)); + rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); if (!err) { @@ -2094,13 +2166,12 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { - struct net *net = dev_net(dev); int err; - rtnl_net_lock(net); + rtnl_net_dev_lock(dev); list_del(&nn->list); - err = __unregister_netdevice_notifier_net(net, nb); - rtnl_net_unlock(net); + err = __unregister_netdevice_notifier_net(dev_net(dev), nb); + rtnl_net_dev_unlock(dev); return err; } @@ -6925,6 +6996,23 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +static void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), @@ -11901,11 +11989,9 @@ EXPORT_SYMBOL(unregister_netdevice_many); */ void unregister_netdev(struct net_device *dev) { - struct net *net = dev_net(dev); - - rtnl_net_lock(net); + rtnl_net_dev_lock(dev); unregister_netdevice(dev); - rtnl_net_unlock(net); + rtnl_net_dev_unlock(dev); } EXPORT_SYMBOL(unregister_netdev); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 6efd4cccc9dd..212f0a048cab 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1734,30 +1734,30 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family(&net_drop_monitor_family); - if (rc) { - pr_err("Could not create drop monitor netlink family\n"); - return rc; + for_each_possible_cpu(cpu) { + net_dm_cpu_data_init(cpu); + net_dm_hw_cpu_data_init(cpu); } - WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { pr_crit("Failed to register netdevice notifier\n"); + return rc; + } + + rc = genl_register_family(&net_drop_monitor_family); + if (rc) { + pr_err("Could not create drop monitor netlink family\n"); goto out_unreg; } + WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = 0; - for_each_possible_cpu(cpu) { - net_dm_cpu_data_init(cpu); - net_dm_hw_cpu_data_init(cpu); - } - goto out; out_unreg: - genl_unregister_family(&net_drop_monitor_family); + WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); out: return rc; } @@ -1766,19 +1766,18 @@ static void exit_net_drop_monitor(void) { int cpu; - BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); - /* * Because of the module_get/put we do in the trace state change path * we are guaranteed not to have any current users when we get here */ + BUG_ON(genl_unregister_family(&net_drop_monitor_family)); + + BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); for_each_possible_cpu(cpu) { net_dm_hw_cpu_data_fini(cpu); net_dm_cpu_data_fini(cpu); } - - BUG_ON(genl_unregister_family(&net_drop_monitor_family)); } module_init(init_net_drop_monitor); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5db41bf2ed93..9cd8de6bebb5 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -853,23 +853,30 @@ __skb_flow_dissect_ports(const struct sk_buff *skb, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { - enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; - struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_ports_range *key_ports_range = NULL; + struct flow_dissector_key_ports *key_ports = NULL; + __be32 ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS; - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE; + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); - if (dissector_ports == FLOW_DISSECTOR_KEY_MAX) + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + + if (!key_ports && !key_ports_range) return; - key_ports = skb_flow_dissector_target(flow_dissector, - dissector_ports, - target_container); - key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); + ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); + + if (key_ports) + key_ports->ports = ports; + + if (key_ports_range) + key_ports_range->tp.ports = ports; } static void @@ -924,6 +931,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { + struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -968,20 +976,21 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE, - target_container); - - if (key_ports) { key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) { + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + key_ports_range->tp.src = flow_keys->sport; + key_ports_range->tp.dst = flow_keys->dport; + } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { diff --git a/net/core/gro.c b/net/core/gro.c index d1f44084e978..78b320b63174 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -7,9 +7,6 @@ #define MAX_GRO_SKBS 8 -/* This should be increased if a protocol with a bigger head is added. */ -#define GRO_MAX_HEAD (MAX_HEADER + 128) - static DEFINE_SPINLOCK(offload_lock); /** diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cb39a12b2f82..4303f2a49262 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -464,7 +464,7 @@ static void net_complete_free(void) } -static void net_free(struct net *net) +void net_passive_dec(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); @@ -482,7 +482,7 @@ void net_drop_ns(void *p) struct net *net = (struct net *)p; if (net) - net_free(net); + net_passive_dec(net); } struct net *copy_net_ns(unsigned long flags, @@ -523,7 +523,7 @@ put_userns: key_remove_domain(net->key_domain); #endif put_user_ns(user_ns); - net_free(net); + net_passive_dec(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); @@ -672,7 +672,7 @@ static void cleanup_net(struct work_struct *work) key_remove_domain(net->key_domain); #endif put_user_ns(net->user_ns); - net_free(net); + net_passive_dec(net); } cleanup_net_task = NULL; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a441613a1e6c..7b03b64fdcb2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -69,6 +69,7 @@ #include <net/dst.h> #include <net/sock.h> #include <net/checksum.h> +#include <net/gro.h> #include <net/gso.h> #include <net/hotdata.h> #include <net/ip6_checksum.h> @@ -95,7 +96,9 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif -#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) +#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN) +#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \ + GRO_MAX_HEAD_PAD)) /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique @@ -220,67 +223,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) -#if PAGE_SIZE == SZ_4K - -#define NAPI_HAS_SMALL_PAGE_FRAG 1 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) - -/* specialized page frag allocator using a single order 0 page - * and slicing it into 1K sized fragment. Constrained to systems - * with a very limited amount of 1K fragments fitting a single - * page - to avoid excessive truesize underestimation - */ - -struct page_frag_1k { - void *va; - u16 offset; - bool pfmemalloc; -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) -{ - struct page *page; - int offset; - - offset = nc->offset - SZ_1K; - if (likely(offset >= 0)) - goto use_frag; - - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - if (!page) - return NULL; - - nc->va = page_address(page); - nc->pfmemalloc = page_is_pfmemalloc(page); - offset = PAGE_SIZE - SZ_1K; - page_ref_add(page, offset / SZ_1K); - -use_frag: - nc->offset = offset; - return nc->va + offset; -} -#else - -/* the small page is actually unused in this build; add dummy helpers - * to please the compiler and avoid later preprocessor's conditionals - */ -#define NAPI_HAS_SMALL_PAGE_FRAG 0 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false - -struct page_frag_1k { -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) -{ - return NULL; -} - -#endif - struct napi_alloc_cache { local_lock_t bh_lock; struct page_frag_cache page; - struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -290,23 +235,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. - * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. - */ -void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} - void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -736,7 +664,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. */ - if (len <= SKB_WITH_OVERHEAD(1024) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); @@ -813,10 +741,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. - * When the small frag allocator is available, prefer it over kmalloc - * for small fragments */ - if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -826,32 +752,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) goto skb_success; } + len = SKB_HEAD_ALIGN(len); + if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache); - if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { - /* we are artificially inflating the allocation size, but - * that is not as bad as it may look like, as: - * - 'len' less than GRO_MAX_HEAD makes little sense - * - On most systems, larger 'len' values lead to fragment - * size above 512 bytes - * - kmalloc would use the kmalloc-1k slab for such values - * - Builds with smaller GRO_MAX_HEAD will very likely do - * little networking, as that implies no WiFi and no - * tunnels support, and 32 bits arches. - */ - len = SZ_1K; - data = page_frag_alloc_1k(&nc->page_small, gfp_mask); - pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); - } else { - len = SKB_HEAD_ALIGN(len); - - data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); - } + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f1b9b3958792..2f1be9baad05 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -541,6 +541,9 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; + if (sk_is_vsock(sk) && + (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) + return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } |