diff options
Diffstat (limited to 'net/core')
44 files changed, 7782 insertions, 3538 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 4268846f2f47..5857cec87b83 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -4,14 +4,17 @@ # obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ - gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o + gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \ + flow_dissector.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ - fib_notifier.o xdp.o flow_offload.o + fib_notifier.o xdp.o flow_offload.o gro.o + +obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 68d2cbf8331a..94374d529ea4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -13,6 +13,7 @@ #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> +#include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(sk_cache); @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) struct bpf_local_storage *sk_storage; struct bpf_local_storage_map *smap; - sk_storage = rcu_dereference(sk->sk_bpf_storage); + sk_storage = + rcu_dereference_check(sk->sk_bpf_storage, bpf_rcu_lock_held()); if (!sk_storage) return NULL; @@ -38,7 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata)); + bpf_selem_unlink(SELEM(sdata), true); return 0; } @@ -73,8 +75,8 @@ void bpf_sk_storage_free(struct sock *sk) * sk_storage. */ bpf_selem_unlink_map(selem); - free_sk_storage = bpf_selem_unlink_storage_nolock(sk_storage, - selem, true); + free_sk_storage = bpf_selem_unlink_storage_nolock( + sk_storage, selem, true, false); } raw_spin_unlock_bh(&sk_storage->lock); rcu_read_unlock(); @@ -139,7 +141,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, if (sock) { sdata = bpf_local_storage_update( sock->sk, (struct bpf_local_storage_map *)map, value, - map_flags); + map_flags, GFP_ATOMIC); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } @@ -170,7 +172,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, { struct bpf_local_storage_elem *copy_selem; - copy_selem = bpf_selem_alloc(smap, newsk, NULL, true); + copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC); if (!copy_selem) return NULL; @@ -228,7 +230,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) bpf_selem_link_map(smap, copy_selem); bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { - ret = bpf_local_storage_alloc(newsk, smap, copy_selem); + ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); if (ret) { kfree(copy_selem); atomic_sub(smap->elem_size, @@ -253,11 +255,13 @@ out: return ret; } -BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, - void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) return (unsigned long)NULL; @@ -274,7 +278,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, refcount_inc_not_zero(&sk->sk_refcnt)) { sdata = bpf_local_storage_update( sk, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST); + BPF_NOEXIST, gfp_flags); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. */ @@ -288,6 +292,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) { + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!sk || !sk_fullsock(sk)) return -EINVAL; @@ -305,11 +310,12 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { + int optmem_max = READ_ONCE(sysctl_optmem_max); struct sock *sk = (struct sock *)owner; /* same check as in sock_kmalloc() */ - if (size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + if (size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { atomic_add(size, &sk->sk_omem_alloc); return 0; } @@ -333,7 +339,7 @@ bpf_sk_storage_ptr(void *owner) return &sk->sk_bpf_storage; } -static int sk_storage_map_btf_id; +BTF_ID_LIST_SINGLE(sk_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops sk_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -344,8 +350,7 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_name = "bpf_local_storage_map", - .map_btf_id = &sk_storage_map_btf_id, + .map_btf_id = &sk_storage_map_btf_ids[0], .map_local_storage_charge = bpf_sk_storage_charge, .map_local_storage_uncharge = bpf_sk_storage_uncharge, .map_owner_storage_ptr = bpf_sk_storage_ptr, @@ -401,6 +406,8 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: btf_vmlinux = bpf_get_btf_vmlinux(); + if (IS_ERR_OR_NULL(btf_vmlinux)) + return false; btf_id = prog->aux->attach_btf_id; t = btf_type_by_id(btf_vmlinux, btf_id); tname = btf_name_by_offset(btf_vmlinux, t->name_off); @@ -413,18 +420,22 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return false; } -BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, - void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags, gfp_t, gfp_flags) { + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (in_hardirq() || in_nmi()) return (unsigned long)NULL; - return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); + return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags, + gfp_flags); } BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, struct sock *, sk) { + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (in_hardirq() || in_nmi()) return -EPERM; @@ -865,10 +876,18 @@ static int bpf_iter_init_sk_storage_map(void *priv_data, { struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; + bpf_map_inc_with_uref(aux->map); seq_info->map = aux->map; return 0; } +static void bpf_iter_fini_sk_storage_map(void *priv_data) +{ + struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; + + bpf_map_put_with_uref(seq_info->map); +} + static int bpf_iter_attach_map(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux) @@ -886,7 +905,7 @@ static int bpf_iter_attach_map(struct bpf_prog *prog, if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) goto put_map; - if (prog->aux->max_rdonly_access > map->value_size) { + if (prog->aux->max_rdwr_access > map->value_size) { err = -EACCES; goto put_map; } @@ -914,7 +933,7 @@ static const struct seq_operations bpf_sk_storage_map_seq_ops = { static const struct bpf_iter_seq_info iter_seq_info = { .seq_ops = &bpf_sk_storage_map_seq_ops, .init_seq_private = bpf_iter_init_sk_storage_map, - .fini_seq_private = NULL, + .fini_seq_private = bpf_iter_fini_sk_storage_map, .seq_priv_size = sizeof(struct bpf_iter_seq_sk_storage_map_info), }; @@ -929,7 +948,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), PTR_TO_BTF_ID_OR_NULL }, { offsetof(struct bpf_iter__bpf_sk_storage_map, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, .seq_info = &iter_seq_info, }; diff --git a/net/core/datagram.c b/net/core/datagram.c index ee290776c661..e4ff2db40c98 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -62,8 +62,6 @@ #include <trace/events/skb.h> #include <net/busy_poll.h> -#include "datagram.h" - /* * Is a socket 'connection oriented' ? */ @@ -310,12 +308,11 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, EXPORT_SYMBOL(__skb_recv_datagram); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, - int noblock, int *err) + int *err) { int off = 0; - return __skb_recv_datagram(sk, &sk->sk_receive_queue, - flags | (noblock ? MSG_DONTWAIT : 0), + return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags, &off, err); } EXPORT_SYMBOL(skb_recv_datagram); @@ -323,7 +320,6 @@ EXPORT_SYMBOL(skb_recv_datagram); void skb_free_datagram(struct sock *sk, struct sk_buff *skb) { consume_skb(skb); - sk_mem_reclaim_partial(sk); } EXPORT_SYMBOL(skb_free_datagram); @@ -339,7 +335,6 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) slow = lock_sock_fast(sk); sk_peek_offset_bwd(sk, len); skb_orphan(skb); - sk_mem_reclaim_partial(sk); unlock_sock_fast(sk, slow); /* skb is now orphaned, can be freed outside of locked section */ @@ -399,7 +394,6 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) NULL); kfree_skb(skb); - sk_mem_reclaim_partial(sk); return err; } EXPORT_SYMBOL(skb_kill_datagram); @@ -616,10 +610,16 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iter); -int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, - struct iov_iter *from, size_t length) +int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb, struct iov_iter *from, + size_t length) { - int frag = skb_shinfo(skb)->nr_frags; + int frag; + + if (msg && msg->msg_ubuf && msg->sg_from_iter) + return msg->sg_from_iter(sk, skb, from, length); + + frag = skb_shinfo(skb)->nr_frags; while (length && iov_iter_count(from)) { struct page *pages[MAX_SKB_FRAGS]; @@ -632,12 +632,11 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; - copied = iov_iter_get_pages(from, pages, length, + copied = iov_iter_get_pages2(from, pages, length, MAX_SKB_FRAGS - frag, &start); if (copied < 0) return -EFAULT; - iov_iter_advance(from, copied); length -= copied; truesize = PAGE_ALIGN(copied + start); @@ -678,7 +677,7 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, page_ref_sub(last_head, refs); refs = 0; } - skb_fill_page_desc(skb, frag++, head, start, size); + skb_fill_page_desc_noacc(skb, frag++, head, start, size); } if (refs) page_ref_sub(last_head, refs); @@ -705,7 +704,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; - return __zerocopy_sg_from_iter(NULL, skb, from, ~0U); + return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); } EXPORT_SYMBOL(zerocopy_sg_from_iter); diff --git a/net/core/datagram.h b/net/core/datagram.h deleted file mode 100644 index bcfb75bfa3b2..000000000000 --- a/net/core/datagram.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _NET_CORE_DATAGRAM_H_ -#define _NET_CORE_DATAGRAM_H_ - -#include <linux/types.h> - -struct sock; -struct sk_buff; -struct iov_iter; - -int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, - struct iov_iter *from, size_t length); - -#endif /* _NET_CORE_DATAGRAM_H_ */ diff --git a/net/core/dev.c b/net/core/dev.c index 2a352e668d10..3be256051e99 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -151,18 +151,13 @@ #include <linux/prandom.h> #include <linux/once_lite.h> +#include "dev.h" #include "net-sysfs.h" -#define MAX_GRO_SKBS 8 - -/* This should be increased if a protocol with a bigger head is added. */ -#define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(ptype_lock); -static DEFINE_SPINLOCK(offload_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; struct list_head ptype_all __read_mostly; /* Taps */ -static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, @@ -222,18 +217,38 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } -static inline void rps_lock(struct softnet_data *sd) +static inline void rps_lock_irqsave(struct softnet_data *sd, + unsigned long *flags) { -#ifdef CONFIG_RPS - spin_lock(&sd->input_pkt_queue.lock); -#endif + if (IS_ENABLED(CONFIG_RPS)) + spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_save(*flags); } -static inline void rps_unlock(struct softnet_data *sd) +static inline void rps_lock_irq_disable(struct softnet_data *sd) { -#ifdef CONFIG_RPS - spin_unlock(&sd->input_pkt_queue.lock); -#endif + if (IS_ENABLED(CONFIG_RPS)) + spin_lock_irq(&sd->input_pkt_queue.lock); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); +} + +static inline void rps_unlock_irq_restore(struct softnet_data *sd, + unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(*flags); +} + +static inline void rps_unlock_irq_enable(struct softnet_data *sd) +{ + if (IS_ENABLED(CONFIG_RPS)) + spin_unlock_irq(&sd->input_pkt_queue.lock); + else if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); } static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, @@ -326,7 +341,6 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name) return 0; } -EXPORT_SYMBOL(netdev_name_node_alt_create); static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) { @@ -354,7 +368,6 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) return 0; } -EXPORT_SYMBOL(netdev_name_node_alt_destroy); static void netdev_name_node_alt_flush(struct net_device *dev) { @@ -371,12 +384,12 @@ static void list_netdevice(struct net_device *dev) ASSERT_RTNL(); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); dev_base_seq_inc(net); } @@ -384,16 +397,18 @@ static void list_netdevice(struct net_device *dev) /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ -static void unlist_netdevice(struct net_device *dev) +static void unlist_netdevice(struct net_device *dev, bool lock) { ASSERT_RTNL(); /* Unlink dev from the device chain */ - write_lock_bh(&dev_base_lock); + if (lock) + write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock_bh(&dev_base_lock); + if (lock) + write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -604,84 +619,6 @@ void dev_remove_pack(struct packet_type *pt) EXPORT_SYMBOL(dev_remove_pack); -/** - * dev_add_offload - register offload handlers - * @po: protocol offload declaration - * - * Add protocol offload handlers to the networking stack. The passed - * &proto_offload is linked into kernel lists and may not be freed until - * it has been removed from the kernel lists. - * - * This call does not sleep therefore it can not - * guarantee all CPU's that are in middle of receiving packets - * will see the new offload handlers (until the next received packet). - */ -void dev_add_offload(struct packet_offload *po) -{ - struct packet_offload *elem; - - spin_lock(&offload_lock); - list_for_each_entry(elem, &offload_base, list) { - if (po->priority < elem->priority) - break; - } - list_add_rcu(&po->list, elem->list.prev); - spin_unlock(&offload_lock); -} -EXPORT_SYMBOL(dev_add_offload); - -/** - * __dev_remove_offload - remove offload handler - * @po: packet offload declaration - * - * Remove a protocol offload handler that was previously added to the - * kernel offload handlers by dev_add_offload(). The passed &offload_type - * is removed from the kernel lists and can be freed or reused once this - * function returns. - * - * The packet type might still be in use by receivers - * and must not be freed until after all the CPU's have gone - * through a quiescent state. - */ -static void __dev_remove_offload(struct packet_offload *po) -{ - struct list_head *head = &offload_base; - struct packet_offload *po1; - - spin_lock(&offload_lock); - - list_for_each_entry(po1, head, list) { - if (po == po1) { - list_del_rcu(&po->list); - goto out; - } - } - - pr_warn("dev_remove_offload: %p not found\n", po); -out: - spin_unlock(&offload_lock); -} - -/** - * dev_remove_offload - remove packet offload handler - * @po: packet offload declaration - * - * Remove a packet offload handler that was previously added to the kernel - * offload handlers by dev_add_offload(). The passed &offload_type is - * removed from the kernel lists and can be freed or reused once this - * function returns. - * - * This call sleeps to guarantee that no CPU is looking at the packet - * type after return. - */ -void dev_remove_offload(struct packet_offload *po) -{ - __dev_remove_offload(po); - - synchronize_net(); -} -EXPORT_SYMBOL(dev_remove_offload); - /******************************************************************************* * * Device Interface Subroutines @@ -747,11 +684,11 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, const struct net_device *last_dev; struct net_device_path_ctx ctx = { .dev = dev, - .daddr = daddr, }; struct net_device_path *path; int ret = 0; + memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); stack->num_paths = 0; while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { last_dev = ctx.dev; @@ -767,6 +704,10 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, if (WARN_ON_ONCE(last_dev == ctx.dev)) return -1; } + + if (!ctx.dev) + return ret; + path = dev_fwd_path(stack); if (!path) return -1; @@ -1121,7 +1062,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, name_node->name, IFNAMSIZ)) - set_bit(i, inuse); + __set_bit(i, inuse); } if (!sscanf(d->name, name, &i)) continue; @@ -1131,7 +1072,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, d->name, IFNAMSIZ)) - set_bit(i, inuse); + __set_bit(i, inuse); } i = find_first_zero_bit(inuse, max_netdevices); @@ -1159,7 +1100,7 @@ static int dev_alloc_name_ns(struct net *net, BUG_ON(!net); ret = __dev_alloc_name(net, name, buf); if (ret >= 0) - strlcpy(dev->name, buf, IFNAMSIZ); + strscpy(dev->name, buf, IFNAMSIZ); return ret; } @@ -1196,7 +1137,7 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev, else if (netdev_name_in_use(net, name)) return -EEXIST; else if (dev->name != name) - strlcpy(dev->name, name, IFNAMSIZ); + strscpy(dev->name, name, IFNAMSIZ); return 0; } @@ -1272,15 +1213,15 @@ rollback: netdev_adjacent_rename_links(dev, oldname); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); netdev_name_node_del(dev->name_node); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); synchronize_rcu(); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); @@ -1461,6 +1402,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) int ret; ASSERT_RTNL(); + dev_addr_check(dev); if (!netif_device_present(dev)) { /* may be detached because parent is runtime-suspended */ @@ -1685,7 +1627,8 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) - N(PRE_CHANGEADDR) + N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) + N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) } #undef N return "UNKNOWN_NETDEV_EVENT"; @@ -2002,6 +1945,32 @@ static int call_netdevice_notifiers_info(unsigned long val, return raw_notifier_call_chain(&netdev_chain, val, info); } +/** + * call_netdevice_notifiers_info_robust - call per-netns notifier blocks + * for and rollback on error + * @val_up: value passed unmodified to notifier function + * @val_down: value passed unmodified to the notifier function when + * recovering from an error on @val_up + * @info: notifier information data + * + * Call all per-netns network notifier blocks, but not notifier blocks on + * the global notifier chain. Parameters and return value are as for + * raw_notifier_call_chain_robust(). + */ + +static int +call_netdevice_notifiers_info_robust(unsigned long val_up, + unsigned long val_down, + struct netdev_notifier_info *info) +{ + struct net *net = dev_net(info->dev); + + ASSERT_RTNL(); + + return raw_notifier_call_chain_robust(&net->netdev_chain, + val_up, val_down, info); +} + static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack) @@ -2083,7 +2052,8 @@ void net_dec_egress_queue(void) EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif -static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); +DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); +EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; @@ -2144,14 +2114,15 @@ EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; + skb->mono_delivery_time = 0; if (static_branch_unlikely(&netstamp_needed_key)) - __net_timestamp(skb); + skb->tstamp = ktime_get_real(); } #define net_timestamp_check(COND, SKB) \ if (static_branch_unlikely(&netstamp_needed_key)) { \ if ((COND) && !(SKB)->tstamp) \ - __net_timestamp(SKB); \ + (SKB)->tstamp = ktime_get_real(); \ } \ bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) @@ -3024,15 +2995,73 @@ undo_rx: EXPORT_SYMBOL(netif_set_real_num_queues); /** + * netif_set_tso_max_size() - set the max size of TSO frames supported + * @dev: netdev to update + * @size: max skb->len of a TSO frame + * + * Set the limit on the size of TSO super-frames the device can handle. + * Unless explicitly set the stack will assume the value of + * %GSO_LEGACY_MAX_SIZE. + */ +void netif_set_tso_max_size(struct net_device *dev, unsigned int size) +{ + dev->tso_max_size = min(GSO_MAX_SIZE, size); + if (size < READ_ONCE(dev->gso_max_size)) + netif_set_gso_max_size(dev, size); +} +EXPORT_SYMBOL(netif_set_tso_max_size); + +/** + * netif_set_tso_max_segs() - set the max number of segs supported for TSO + * @dev: netdev to update + * @segs: max number of TCP segments + * + * Set the limit on the number of TCP segments the device can generate from + * a single TSO super-frame. + * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. + */ +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) +{ + dev->tso_max_segs = segs; + if (segs < READ_ONCE(dev->gso_max_segs)) + netif_set_gso_max_segs(dev, segs); +} +EXPORT_SYMBOL(netif_set_tso_max_segs); + +/** + * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper + * @to: netdev to update + * @from: netdev from which to copy the limits + */ +void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) +{ + netif_set_tso_max_size(to, from->tso_max_size); + netif_set_tso_max_segs(to, from->tso_max_segs); +} +EXPORT_SYMBOL(netif_inherit_tso_max); + +/** * netif_get_num_default_rss_queues - default number of RSS queues * - * This routine should set an upper limit on the number of RSS queues - * used by default by multiqueue devices. + * Default value is the number of physical cores if there are only 1 or 2, or + * divided by 2 if there are more. */ int netif_get_num_default_rss_queues(void) { - return is_kdump_kernel() ? - 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); + cpumask_var_t cpus; + int cpu, count = 0; + + if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) + return 1; + + cpumask_copy(cpus, cpu_online_mask); + for_each_cpu(cpu, cpus) { + ++count; + cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); + } + free_cpumask_var(cpus); + + return count > 2 ? DIV_ROUND_UP(count, 2) : count; } EXPORT_SYMBOL(netif_get_num_default_rss_queues); @@ -3239,12 +3268,18 @@ int skb_checksum_help(struct sk_buff *skb) } offset = skb_checksum_start_offset(skb); - BUG_ON(offset >= skb_headlen(skb)); + ret = -EINVAL; + if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + goto out; + } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; - BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); - + if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + goto out; + } ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); if (ret) goto out; @@ -3315,40 +3350,6 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) return __vlan_get_protocol(skb, type, depth); } -/** - * skb_mac_gso_segment - mac layer segmentation handler. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - */ -struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; - int vlan_depth = skb->mac_len; - __be16 type = skb_network_protocol(skb, &vlan_depth); - - if (unlikely(!type)) - return ERR_PTR(-EINVAL); - - __skb_pull(skb, vlan_depth); - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { - if (ptype->type == type && ptype->callbacks.gso_segment) { - segs = ptype->callbacks.gso_segment(skb, features); - break; - } - } - rcu_read_unlock(); - - __skb_push(skb, skb->data - skb_mac_header(skb)); - - return segs; -} -EXPORT_SYMBOL(skb_mac_gso_segment); - - /* openvswitch calls this on rx path, so we need a different check. */ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) @@ -3513,7 +3514,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, { u16 gso_segs = skb_shinfo(skb)->gso_segs; - if (gso_segs > dev->gso_max_segs) + if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; if (!skb_shinfo(skb)->gso_type) { @@ -3585,7 +3586,6 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, dev_queue_xmit_nit(skb, dev); len = skb->len; - PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies); trace_net_dev_start_xmit(skb, dev); rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); @@ -3703,7 +3703,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device out_kfree_skb: kfree_skb(skb); out_null: - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); return NULL; } @@ -3827,7 +3827,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, no_lock_out: if (unlikely(to_free)) - kfree_skb_list(to_free); + kfree_skb_list_reason(to_free, + SKB_DROP_REASON_QDISC_DROP); return rc; } @@ -3836,8 +3837,12 @@ no_lock_out: * separate lock before trying to get qdisc main lock. * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. + * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit + * and then other tasks will only enqueue packets. The packets will be + * sent after the qdisc owner is scheduled again. To prevent this + * scenario the task always serialize on the lock. */ - contended = qdisc_is_running(q); + contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); if (unlikely(contended)) spin_lock(&q->busylock); @@ -3878,7 +3883,7 @@ no_lock_out: } spin_unlock(root_lock); if (unlikely(to_free)) - kfree_skb_list(to_free); + kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; @@ -3922,9 +3927,9 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) skb->pkt_type = PACKET_LOOPBACK; if (skb->ip_summed == CHECKSUM_NONE) skb->ip_summed = CHECKSUM_UNNECESSARY; - WARN_ON(!skb_dst(skb)); + DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb)); skb_dst_force(skb); - netif_rx_ni(skb); + netif_rx(skb); return 0; } EXPORT_SYMBOL(dev_loopback_xmit); @@ -3941,8 +3946,8 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return skb; /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ - qdisc_skb_cb(skb)->mru = 0; - qdisc_skb_cb(skb)->post_ct = false; + tc_skb_cb(skb)->mru = 0; + tc_skb_cb(skb)->post_ct = false; mini_qdisc_bstats_cpu_update(miniq, skb); switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { @@ -3953,7 +3958,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) case TC_ACT_SHOT: mini_qdisc_qstats_cpu_drop(miniq); *ret = NET_XMIT_DROP; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: @@ -3973,6 +3978,25 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return skb; } + +static struct netdev_queue * +netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) +{ + int qm = skb_get_queue_mapping(skb); + + return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); +} + +static bool netdev_xmit_txqueue_skipped(void) +{ + return __this_cpu_read(softnet_data.xmit.skip_txqueue); +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_XPS @@ -4115,40 +4139,36 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, } /** - * __dev_queue_xmit - transmit a buffer - * @skb: buffer to transmit - * @sb_dev: suboordinate device used for L2 forwarding offload - * - * Queue a buffer for transmission to a network device. The caller must - * have set the device and priority and built the buffer before calling - * this function. The function can be called from an interrupt. + * __dev_queue_xmit() - transmit a buffer + * @skb: buffer to transmit + * @sb_dev: suboordinate device used for L2 forwarding offload * - * A negative errno code is returned on a failure. A success does not - * guarantee the frame will be transmitted as it may be dropped due - * to congestion or traffic shaping. + * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling + * this function. The function can be called from an interrupt. * - * ----------------------------------------------------------------------------------- - * I notice this method can also return errors from the queue disciplines, - * including NET_XMIT_DROP, which is a positive value. So, errors can also - * be positive. + * When calling this method, interrupts MUST be enabled. This is because + * the BH enable code must have IRQs enabled so that it will not deadlock. * - * Regardless of the return value, the skb is consumed, so it is currently - * difficult to retry a send to this method. (You can bump the ref count - * before sending to hold a reference for retry if you are careful.) + * Regardless of the return value, the skb is consumed, so it is currently + * difficult to retry a send to this method. (You can bump the ref count + * before sending to hold a reference for retry if you are careful.) * - * When calling this method, interrupts MUST be enabled. This is because - * the BH enable code must have IRQs enabled so that it will not deadlock. - * --BLG + * Return: + * * 0 - buffer successfully transmitted + * * positive qdisc return code - NET_XMIT_DROP etc. + * * negative errno - other errors */ -static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) +int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; - struct netdev_queue *txq; + struct netdev_queue *txq = NULL; struct Qdisc *q; int rc = -ENOMEM; bool again = false; skb_reset_mac_header(skb); + skb_assert_len(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); @@ -4171,11 +4191,17 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (!skb) goto out; } + + netdev_xmit_skip_txqueue(false); + nf_skip_egress(skb, true); skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; nf_skip_egress(skb, false); + + if (netdev_xmit_txqueue_skipped()) + txq = netdev_tx_queue_mapping(dev, skb); } #endif /* If device/qdisc don't need skb->dst, release it right now while @@ -4186,7 +4212,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) else skb_dst_force(skb); - txq = netdev_core_pick_tx(dev, skb, sb_dev); + if (!txq) + txq = netdev_core_pick_tx(dev, skb, sb_dev); + q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); @@ -4221,7 +4249,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (!skb) goto out; - PRANDOM_ADD_NOISE(skb, dev, txq, jiffies); HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { @@ -4249,25 +4276,14 @@ recursion_alert: rc = -ENETDOWN; rcu_read_unlock_bh(); - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); return rc; } - -int dev_queue_xmit(struct sk_buff *skb) -{ - return __dev_queue_xmit(skb, NULL); -} -EXPORT_SYMBOL(dev_queue_xmit); - -int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) -{ - return __dev_queue_xmit(skb, sb_dev); -} -EXPORT_SYMBOL(dev_queue_xmit_accel); +EXPORT_SYMBOL(__dev_queue_xmit); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { @@ -4287,7 +4303,6 @@ int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) skb_set_queue_mapping(skb, queue_id); txq = skb_get_tx_queue(dev, skb); - PRANDOM_ADD_NOISE(skb, dev, txq, jiffies); local_bh_disable(); @@ -4301,7 +4316,7 @@ int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) local_bh_enable(); return ret; drop: - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return NET_XMIT_DROP; } @@ -4315,6 +4330,7 @@ int netdev_max_backlog __read_mostly = 1000; EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; +unsigned int sysctl_skb_defer_max __read_mostly = 64; int netdev_budget __read_mostly = 300; /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; @@ -4323,8 +4339,6 @@ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ int dev_rx_weight __read_mostly = 64; int dev_tx_weight __read_mostly = 64; -/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ -int gro_normal_batch __read_mostly = 8; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, @@ -4332,6 +4346,8 @@ static inline void ____napi_schedule(struct softnet_data *sd, { struct task_struct *thread; + lockdep_assert_irqs_disabled(); + if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in * napi_enable()/dev_set_threaded(). @@ -4566,16 +4582,25 @@ static void rps_trigger_softirq(void *data) #endif /* CONFIG_RPS */ +/* Called from hardirq (IPI) context */ +static void trigger_rx_softirq(void *data) +{ + struct softnet_data *sd = data; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + smp_store_release(&sd->defer_ipi_scheduled, 0); +} + /* * Check if this softnet_data structure is another cpu one * If yes, queue it to our IPI list and return 1 * If no, return 0 */ -static int rps_ipi_queued(struct softnet_data *sd) +static int napi_schedule_rps(struct softnet_data *sd) { -#ifdef CONFIG_RPS struct softnet_data *mysd = this_cpu_ptr(&softnet_data); +#ifdef CONFIG_RPS if (sd != mysd) { sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; @@ -4584,6 +4609,7 @@ static int rps_ipi_queued(struct softnet_data *sd) return 1; } #endif /* CONFIG_RPS */ + __napi_schedule_irqoff(&mysd->backlog); return 0; } @@ -4598,7 +4624,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) struct softnet_data *sd; unsigned int old_flow, new_flow; - if (qlen < (netdev_max_backlog >> 1)) + if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); @@ -4634,46 +4660,42 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { + enum skb_drop_reason reason; struct softnet_data *sd; unsigned long flags; unsigned int qlen; + reason = SKB_DROP_REASON_NOT_SPECIFIED; sd = &per_cpu(softnet_data, cpu); - local_irq_save(flags); - - rps_lock(sd); + rps_lock_irqsave(sd, &flags); if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { + if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); - rps_unlock(sd); - local_irq_restore(flags); + rps_unlock_irq_restore(sd, &flags); return NET_RX_SUCCESS; } /* Schedule NAPI for backlog device * We can use non atomic operation since we own the queue lock */ - if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { - if (!rps_ipi_queued(sd)) - ____napi_schedule(sd, &sd->backlog); - } + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) + napi_schedule_rps(sd); goto enqueue; } + reason = SKB_DROP_REASON_CPU_BACKLOG; drop: sd->dropped++; - rps_unlock(sd); + rps_unlock_irq_restore(sd, &flags); - local_irq_restore(flags); - - atomic_long_inc(&skb->dev->rx_dropped); - kfree_skb(skb); + dev_core_stats_rx_dropped_inc(skb->dev); + kfree_skb_reason(skb, reason); return NET_RX_DROP; } @@ -4827,7 +4849,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, case XDP_PASS: break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(skb->dev, xdp_prog, act); @@ -4842,7 +4864,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, } /* When doing generic XDP we have to bypass the qdisc layer and the - * network taps in order to match in-driver-XDP behavior. + * network taps in order to match in-driver-XDP behavior. This also means + * that XDP packets are able to starve other packets going through a qdisc, + * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX + * queues, so they do not have this starvation issue. */ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) { @@ -4854,7 +4879,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) txq = netdev_core_pick_tx(dev, skb, NULL); cpu = smp_processor_id(); HARD_TX_LOCK(dev, txq, cpu); - if (!netif_xmit_stopped(txq)) { + if (!netif_xmit_frozen_or_drv_stopped(txq)) { rc = netdev_start_xmit(skb, dev, txq, 0); if (dev_xmit_complete(rc)) free_skb = false; @@ -4862,6 +4887,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) HARD_TX_UNLOCK(dev, txq); if (free_skb) { trace_xdp_exception(dev, xdp_prog, XDP_TX); + dev_core_stats_tx_dropped_inc(dev); kfree_skb(skb); } } @@ -4893,7 +4919,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) } return XDP_PASS; out_redir: - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); @@ -4902,7 +4928,7 @@ static int netif_rx_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_rx(skb); @@ -4911,7 +4937,6 @@ static int netif_rx_internal(struct sk_buff *skb) struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; - preempt_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); @@ -4921,78 +4946,72 @@ static int netif_rx_internal(struct sk_buff *skb) ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - preempt_enable(); } else #endif { unsigned int qtail; - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); - put_cpu(); + ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail); } return ret; } /** + * __netif_rx - Slightly optimized version of netif_rx + * @skb: buffer to post + * + * This behaves as netif_rx except that it does not disable bottom halves. + * As a result this function may only be invoked from the interrupt context + * (either hard or soft interrupt). + */ +int __netif_rx(struct sk_buff *skb) +{ + int ret; + + lockdep_assert_once(hardirq_count() | softirq_count()); + + trace_netif_rx_entry(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + return ret; +} +EXPORT_SYMBOL(__netif_rx); + +/** * netif_rx - post buffer to the network code * @skb: buffer to post * * This function receives a packet from a device driver and queues it for - * the upper (protocol) levels to process. It always succeeds. The buffer - * may be dropped during processing for congestion control or by the - * protocol layers. + * the upper (protocol) levels to process via the backlog NAPI device. It + * always succeeds. The buffer may be dropped during processing for + * congestion control or by the protocol layers. + * The network buffer is passed via the backlog NAPI device. Modern NIC + * driver should use NAPI and GRO. + * This function can used from interrupt and from process context. The + * caller from process context must not disable interrupts before invoking + * this function. * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped) * */ - int netif_rx(struct sk_buff *skb) { + bool need_bh_off = !(hardirq_count() | softirq_count()); int ret; + if (need_bh_off) + local_bh_disable(); trace_netif_rx_entry(skb); - ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); - + if (need_bh_off) + local_bh_enable(); return ret; } EXPORT_SYMBOL(netif_rx); -int netif_rx_ni(struct sk_buff *skb) -{ - int err; - - trace_netif_rx_ni_entry(skb); - - preempt_disable(); - err = netif_rx_internal(skb); - if (local_softirq_pending()) - do_softirq(); - preempt_enable(); - trace_netif_rx_ni_exit(err); - - return err; -} -EXPORT_SYMBOL(netif_rx_ni); - -int netif_rx_any_context(struct sk_buff *skb) -{ - /* - * If invoked from contexts which do not invoke bottom half - * processing either at return from interrupt or when softrqs are - * reenabled, use netif_rx_ni() which invokes bottomhalf processing - * directly. - */ - if (in_interrupt()) - return netif_rx(skb); - else - return netif_rx_ni(skb); -} -EXPORT_SYMBOL(netif_rx_any_context); - static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -5014,7 +5033,8 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) trace_consume_skb(skb); else - trace_kfree_skb(skb, net_tx_action); + trace_kfree_skb(skb, net_tx_action, + SKB_DROP_REASON_NOT_SPECIFIED); if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); @@ -5103,8 +5123,8 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, } qdisc_skb_cb(skb)->pkt_len = skb->len; - qdisc_skb_cb(skb)->mru = 0; - qdisc_skb_cb(skb)->post_ct = false; + tc_skb_cb(skb)->mru = 0; + tc_skb_cb(skb)->post_ct = false; skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); @@ -5115,12 +5135,14 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, break; case TC_ACT_SHOT: mini_qdisc_qstats_cpu_drop(miniq); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); + *ret = NET_RX_DROP; return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); + *ret = NET_RX_SUCCESS; return NULL; case TC_ACT_REDIRECT: /* skb_mac_header check was done by cls/act_bpf, so @@ -5133,8 +5155,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, *another = true; break; } + *ret = NET_RX_SUCCESS; return NULL; case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; return NULL; default: break; @@ -5261,7 +5285,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, int ret = NET_RX_DROP; __be16 type; - net_timestamp_check(!netdev_tstamp_prequeue, skb); + net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_receive_skb(skb); @@ -5433,10 +5457,10 @@ check_vlan_id: } else { drop: if (!deliver_exact) - atomic_long_inc(&skb->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(skb->dev); else - atomic_long_inc(&skb->dev->rx_nohandler); - kfree_skb(skb); + dev_core_stats_rx_nohandler_inc(skb->dev); + kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ @@ -5644,7 +5668,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; @@ -5667,14 +5691,14 @@ static int netif_receive_skb_internal(struct sk_buff *skb) return ret; } -static void netif_receive_skb_list_internal(struct list_head *head) +void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; struct list_head sublist; INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); @@ -5764,8 +5788,7 @@ static void flush_backlog(struct work_struct *work) local_bh_disable(); sd = this_cpu_ptr(&softnet_data); - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); @@ -5773,8 +5796,7 @@ static void flush_backlog(struct work_struct *work) input_queue_head_incr(sd); } } - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { @@ -5792,16 +5814,14 @@ static bool flush_required(int cpu) struct softnet_data *sd = &per_cpu(softnet_data, cpu); bool do_flush; - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); /* as insertion into process_queue happens with the rps lock held, * process_queue access may race only with dequeue */ do_flush = !skb_queue_empty(&sd->input_pkt_queue) || !skb_queue_empty_lockless(&sd->process_queue); - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); return do_flush; #endif @@ -5845,550 +5865,6 @@ static void flush_all_backlogs(void) cpus_read_unlock(); } -/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ -static void gro_normal_list(struct napi_struct *napi) -{ - if (!napi->rx_count) - return; - netif_receive_skb_list_internal(&napi->rx_list); - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; -} - -/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, - * pass the whole batch up to the stack. - */ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) -{ - list_add_tail(&skb->list, &napi->rx_list); - napi->rx_count += segs; - if (napi->rx_count >= gro_normal_batch) - gro_normal_list(napi); -} - -static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) -{ - struct packet_offload *ptype; - __be16 type = skb->protocol; - struct list_head *head = &offload_base; - int err = -ENOENT; - - BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); - - if (NAPI_GRO_CB(skb)->count == 1) { - skb_shinfo(skb)->gso_size = 0; - goto out; - } - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || !ptype->callbacks.gro_complete) - continue; - - err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, - ipv6_gro_complete, inet_gro_complete, - skb, 0); - break; - } - rcu_read_unlock(); - - if (err) { - WARN_ON(&ptype->list == head); - kfree_skb(skb); - return; - } - -out: - gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); -} - -static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, - bool flush_old) -{ - struct list_head *head = &napi->gro_hash[index].list; - struct sk_buff *skb, *p; - - list_for_each_entry_safe_reverse(skb, p, head, list) { - if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) - return; - skb_list_del_init(skb); - napi_gro_complete(napi, skb); - napi->gro_hash[index].count--; - } - - if (!napi->gro_hash[index].count) - __clear_bit(index, &napi->gro_bitmask); -} - -/* napi->gro_hash[].list contains packets ordered by age. - * youngest packets at the head of it. - * Complete skbs in reverse order to reduce latencies. - */ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) -{ - unsigned long bitmask = napi->gro_bitmask; - unsigned int i, base = ~0U; - - while ((i = ffs(bitmask)) != 0) { - bitmask >>= i; - base += i; - __napi_gro_flush_chain(napi, base, flush_old); - } -} -EXPORT_SYMBOL(napi_gro_flush); - -static void gro_list_prepare(const struct list_head *head, - const struct sk_buff *skb) -{ - unsigned int maclen = skb->dev->hard_header_len; - u32 hash = skb_get_hash_raw(skb); - struct sk_buff *p; - - list_for_each_entry(p, head, list) { - unsigned long diffs; - - NAPI_GRO_CB(p)->flush = 0; - - if (hash != skb_get_hash_raw(p)) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - - diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); - if (skb_vlan_tag_present(p)) - diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); - diffs |= skb_metadata_differs(p, skb); - if (maclen == ETH_HLEN) - diffs |= compare_ether_header(skb_mac_header(p), - skb_mac_header(skb)); - else if (!diffs) - diffs = memcmp(skb_mac_header(p), - skb_mac_header(skb), - maclen); - - /* in most common scenarions 'slow_gro' is 0 - * otherwise we are already on some slower paths - * either skip all the infrequent tests altogether or - * avoid trying too hard to skip each of them individually - */ - if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - struct tc_skb_ext *skb_ext; - struct tc_skb_ext *p_ext; -#endif - - diffs |= p->sk != skb->sk; - diffs |= skb_metadata_dst_cmp(p, skb); - diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); - -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - skb_ext = skb_ext_find(skb, TC_SKB_EXT); - p_ext = skb_ext_find(p, TC_SKB_EXT); - - diffs |= (!!p_ext) ^ (!!skb_ext); - if (!diffs && unlikely(skb_ext)) - diffs |= p_ext->chain ^ skb_ext->chain; -#endif - } - - NAPI_GRO_CB(p)->same_flow = !diffs; - } -} - -static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) -{ - const struct skb_shared_info *pinfo = skb_shinfo(skb); - const skb_frag_t *frag0 = &pinfo->frags[0]; - - NAPI_GRO_CB(skb)->data_offset = 0; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; - - if (!skb_headlen(skb) && pinfo->nr_frags && - !PageHighMem(skb_frag_page(frag0)) && - (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { - NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, - skb_frag_size(frag0), - skb->end - skb->tail); - } -} - -static void gro_pull_from_frag0(struct sk_buff *skb, int grow) -{ - struct skb_shared_info *pinfo = skb_shinfo(skb); - - BUG_ON(skb->end - skb->tail < grow); - - memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); - - skb->data_len -= grow; - skb->tail += grow; - - skb_frag_off_add(&pinfo->frags[0], grow); - skb_frag_size_sub(&pinfo->frags[0], grow); - - if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { - skb_frag_unref(skb, 0); - memmove(pinfo->frags, pinfo->frags + 1, - --pinfo->nr_frags * sizeof(pinfo->frags[0])); - } -} - -static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) -{ - struct sk_buff *oldest; - - oldest = list_last_entry(head, struct sk_buff, list); - - /* We are called with head length >= MAX_GRO_SKBS, so this is - * impossible. - */ - if (WARN_ON_ONCE(!oldest)) - return; - - /* Do not adjust napi->gro_hash[].count, caller is adding a new - * SKB to the chain. - */ - skb_list_del_init(oldest); - napi_gro_complete(napi, oldest); -} - -static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) -{ - u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - struct gro_list *gro_list = &napi->gro_hash[bucket]; - struct list_head *head = &offload_base; - struct packet_offload *ptype; - __be16 type = skb->protocol; - struct sk_buff *pp = NULL; - enum gro_result ret; - int same_flow; - int grow; - - if (netif_elide_gro(skb->dev)) - goto normal; - - gro_list_prepare(&gro_list->list, skb); - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || !ptype->callbacks.gro_receive) - continue; - - skb_set_network_header(skb, skb_gro_offset(skb)); - skb_reset_mac_len(skb); - NAPI_GRO_CB(skb)->same_flow = 0; - NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); - NAPI_GRO_CB(skb)->free = 0; - NAPI_GRO_CB(skb)->encap_mark = 0; - NAPI_GRO_CB(skb)->recursion_counter = 0; - NAPI_GRO_CB(skb)->is_fou = 0; - NAPI_GRO_CB(skb)->is_atomic = 1; - NAPI_GRO_CB(skb)->gro_remcsum_start = 0; - - /* Setup for GRO checksum validation */ - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - NAPI_GRO_CB(skb)->csum = skb->csum; - NAPI_GRO_CB(skb)->csum_valid = 1; - NAPI_GRO_CB(skb)->csum_cnt = 0; - break; - case CHECKSUM_UNNECESSARY: - NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; - NAPI_GRO_CB(skb)->csum_valid = 0; - break; - default: - NAPI_GRO_CB(skb)->csum_cnt = 0; - NAPI_GRO_CB(skb)->csum_valid = 0; - } - - pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, - ipv6_gro_receive, inet_gro_receive, - &gro_list->list, skb); - break; - } - rcu_read_unlock(); - - if (&ptype->list == head) - goto normal; - - if (PTR_ERR(pp) == -EINPROGRESS) { - ret = GRO_CONSUMED; - goto ok; - } - - same_flow = NAPI_GRO_CB(skb)->same_flow; - ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; - - if (pp) { - skb_list_del_init(pp); - napi_gro_complete(napi, pp); - gro_list->count--; - } - - if (same_flow) - goto ok; - - if (NAPI_GRO_CB(skb)->flush) - goto normal; - - if (unlikely(gro_list->count >= MAX_GRO_SKBS)) - gro_flush_oldest(napi, &gro_list->list); - else - gro_list->count++; - - NAPI_GRO_CB(skb)->count = 1; - NAPI_GRO_CB(skb)->age = jiffies; - NAPI_GRO_CB(skb)->last = skb; - skb_shinfo(skb)->gso_size = skb_gro_len(skb); - list_add(&skb->list, &gro_list->list); - ret = GRO_HELD; - -pull: - grow = skb_gro_offset(skb) - skb_headlen(skb); - if (grow > 0) - gro_pull_from_frag0(skb, grow); -ok: - if (gro_list->count) { - if (!test_bit(bucket, &napi->gro_bitmask)) - __set_bit(bucket, &napi->gro_bitmask); - } else if (test_bit(bucket, &napi->gro_bitmask)) { - __clear_bit(bucket, &napi->gro_bitmask); - } - - return ret; - -normal: - ret = GRO_NORMAL; - goto pull; -} - -struct packet_offload *gro_find_receive_by_type(__be16 type) -{ - struct list_head *offload_head = &offload_base; - struct packet_offload *ptype; - - list_for_each_entry_rcu(ptype, offload_head, list) { - if (ptype->type != type || !ptype->callbacks.gro_receive) - continue; - return ptype; - } - return NULL; -} -EXPORT_SYMBOL(gro_find_receive_by_type); - -struct packet_offload *gro_find_complete_by_type(__be16 type) -{ - struct list_head *offload_head = &offload_base; - struct packet_offload *ptype; - - list_for_each_entry_rcu(ptype, offload_head, list) { - if (ptype->type != type || !ptype->callbacks.gro_complete) - continue; - return ptype; - } - return NULL; -} -EXPORT_SYMBOL(gro_find_complete_by_type); - -static gro_result_t napi_skb_finish(struct napi_struct *napi, - struct sk_buff *skb, - gro_result_t ret) -{ - switch (ret) { - case GRO_NORMAL: - gro_normal_one(napi, skb, 1); - break; - - case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) - napi_skb_free_stolen_head(skb); - else if (skb->fclone != SKB_FCLONE_UNAVAILABLE) - __kfree_skb(skb); - else - __kfree_skb_defer(skb); - break; - - case GRO_HELD: - case GRO_MERGED: - case GRO_CONSUMED: - break; - } - - return ret; -} - -gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) -{ - gro_result_t ret; - - skb_mark_napi_id(skb, napi); - trace_napi_gro_receive_entry(skb); - - skb_gro_reset_offset(skb, 0); - - ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); - trace_napi_gro_receive_exit(ret); - - return ret; -} -EXPORT_SYMBOL(napi_gro_receive); - -static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) -{ - if (unlikely(skb->pfmemalloc)) { - consume_skb(skb); - return; - } - __skb_pull(skb, skb_headlen(skb)); - /* restore the reserve we had after netdev_alloc_skb_ip_align() */ - skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); - __vlan_hwaccel_clear_tag(skb); - skb->dev = napi->dev; - skb->skb_iif = 0; - - /* eth_type_trans() assumes pkt_type is PACKET_HOST */ - skb->pkt_type = PACKET_HOST; - - skb->encapsulation = 0; - skb_shinfo(skb)->gso_type = 0; - skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); - if (unlikely(skb->slow_gro)) { - skb_orphan(skb); - skb_ext_reset(skb); - nf_reset_ct(skb); - skb->slow_gro = 0; - } - - napi->skb = skb; -} - -struct sk_buff *napi_get_frags(struct napi_struct *napi) -{ - struct sk_buff *skb = napi->skb; - - if (!skb) { - skb = napi_alloc_skb(napi, GRO_MAX_HEAD); - if (skb) { - napi->skb = skb; - skb_mark_napi_id(skb, napi); - } - } - return skb; -} -EXPORT_SYMBOL(napi_get_frags); - -static gro_result_t napi_frags_finish(struct napi_struct *napi, - struct sk_buff *skb, - gro_result_t ret) -{ - switch (ret) { - case GRO_NORMAL: - case GRO_HELD: - __skb_push(skb, ETH_HLEN); - skb->protocol = eth_type_trans(skb, skb->dev); - if (ret == GRO_NORMAL) - gro_normal_one(napi, skb, 1); - break; - - case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) - napi_skb_free_stolen_head(skb); - else - napi_reuse_skb(napi, skb); - break; - - case GRO_MERGED: - case GRO_CONSUMED: - break; - } - - return ret; -} - -/* Upper GRO stack assumes network header starts at gro_offset=0 - * Drivers could call both napi_gro_frags() and napi_gro_receive() - * We copy ethernet header into skb->data to have a common layout. - */ -static struct sk_buff *napi_frags_skb(struct napi_struct *napi) -{ - struct sk_buff *skb = napi->skb; - const struct ethhdr *eth; - unsigned int hlen = sizeof(*eth); - - napi->skb = NULL; - - skb_reset_mac_header(skb); - skb_gro_reset_offset(skb, hlen); - - if (unlikely(skb_gro_header_hard(skb, hlen))) { - eth = skb_gro_header_slow(skb, hlen, 0); - if (unlikely(!eth)) { - net_warn_ratelimited("%s: dropping impossible skb from %s\n", - __func__, napi->dev->name); - napi_reuse_skb(napi, skb); - return NULL; - } - } else { - eth = (const struct ethhdr *)skb->data; - gro_pull_from_frag0(skb, hlen); - NAPI_GRO_CB(skb)->frag0 += hlen; - NAPI_GRO_CB(skb)->frag0_len -= hlen; - } - __skb_pull(skb, hlen); - - /* - * This works because the only protocols we care about don't require - * special handling. - * We'll fix it up properly in napi_frags_finish() - */ - skb->protocol = eth->h_proto; - - return skb; -} - -gro_result_t napi_gro_frags(struct napi_struct *napi) -{ - gro_result_t ret; - struct sk_buff *skb = napi_frags_skb(napi); - - trace_napi_gro_frags_entry(skb); - - ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); - trace_napi_gro_frags_exit(ret); - - return ret; -} -EXPORT_SYMBOL(napi_gro_frags); - -/* Compute the checksum from gro_offset and return the folded value - * after adding in any pseudo checksum. - */ -__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) -{ - __wsum wsum; - __sum16 sum; - - wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); - - /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ - sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); - /* See comments in __skb_checksum_complete(). */ - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev, skb); - } - - NAPI_GRO_CB(skb)->csum = wsum; - NAPI_GRO_CB(skb)->csum_valid = 1; - - return sum; -} -EXPORT_SYMBOL(__skb_gro_checksum_complete); - static void net_rps_send_ipi(struct softnet_data *remsd) { #ifdef CONFIG_RPS @@ -6446,7 +5922,7 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = dev_rx_weight; + napi->weight = READ_ONCE(dev_rx_weight); while (again) { struct sk_buff *skb; @@ -6460,8 +5936,7 @@ static int process_backlog(struct napi_struct *napi, int quota) } - local_irq_disable(); - rps_lock(sd); + rps_lock_irq_disable(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). @@ -6477,8 +5952,7 @@ static int process_backlog(struct napi_struct *napi, int quota) skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); } - rps_unlock(sd); - local_irq_enable(); + rps_unlock_irq_enable(sd); } return work; @@ -6888,8 +6362,8 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } EXPORT_SYMBOL(dev_set_threaded); -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) +void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) { if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; @@ -6915,6 +6389,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, set_bit(NAPI_STATE_NPSVC, &napi->state); list_add_rcu(&napi->dev_list, &dev->napi_list); napi_hash_add(napi); + napi_get_frags_check(napi); /* Create kthread for this napi if dev->threaded is set. * Clear dev->threaded if kthread creation failed so that * threaded mode will not be enabled in napi_enable(). @@ -6922,7 +6397,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, if (dev->threaded && napi_kthread_create(napi)) dev->threaded = 0; } -EXPORT_SYMBOL(netif_napi_add); +EXPORT_SYMBOL(netif_napi_add_weight); void napi_disable(struct napi_struct *n) { @@ -7151,12 +6626,34 @@ static int napi_threaded_poll(void *data) return 0; } +static void skb_defer_free_flush(struct softnet_data *sd) +{ + struct sk_buff *skb, *next; + unsigned long flags; + + /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ + if (!READ_ONCE(sd->defer_list)) + return; + + spin_lock_irqsave(&sd->defer_lock, flags); + skb = sd->defer_list; + sd->defer_list = NULL; + sd->defer_count = 0; + spin_unlock_irqrestore(&sd->defer_lock, flags); + + while (skb != NULL) { + next = skb->next; + napi_consume_skb(skb, 1); + skb = next; + } +} + static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + - usecs_to_jiffies(netdev_budget_usecs); - int budget = netdev_budget; + usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); + int budget = READ_ONCE(netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); @@ -7167,9 +6664,11 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) for (;;) { struct napi_struct *n; + skb_defer_free_flush(sd); + if (list_empty(&list)) { if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) - return; + goto end; break; } @@ -7196,10 +6695,12 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); +end:; } struct netdev_adjacent { struct net_device *dev; + netdevice_tracker dev_tracker; /* upper master flag, there can only be one master device per list */ bool master; @@ -7802,6 +7303,16 @@ static int __netdev_update_upper_level(struct net_device *dev, return 0; } +#ifdef CONFIG_LOCKDEP +static LIST_HEAD(net_unlink_list); + +static void net_unlink_todo(struct net_device *dev) +{ + if (list_empty(&dev->unlink_list)) + list_add_tail(&dev->unlink_list, &net_unlink_list); +} +#endif + static int __netdev_update_lower_level(struct net_device *dev, struct netdev_nested_priv *priv) { @@ -7964,7 +7475,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->ref_nr = 1; adj->private = private; adj->ignore = false; - dev_hold(adj_dev); + netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL); pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); @@ -7993,8 +7504,8 @@ remove_symlinks: if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); free_adj: + netdev_put(adj_dev, &adj->dev_tracker); kfree(adj); - dev_put(adj_dev); return ret; } @@ -8035,7 +7546,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, list_del_rcu(&adj->list); pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); - dev_put(adj_dev); + netdev_put(adj_dev, &adj->dev_tracker); kfree_rcu(adj, rcu); } @@ -8384,6 +7895,242 @@ void netdev_bonding_info_change(struct net_device *dev, } EXPORT_SYMBOL(netdev_bonding_info_change); +static int netdev_offload_xstats_enable_l3(struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + int err; + int rc; + + dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), + GFP_KERNEL); + if (!dev->offload_xstats_l3) + return -ENOMEM; + + rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, + NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + err = notifier_to_errno(rc); + if (err) + goto free_stats; + + return 0; + +free_stats: + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; + return err; +} + +int netdev_offload_xstats_enable(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return netdev_offload_xstats_enable_l3(dev, extack); + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_enable); + +static void netdev_offload_xstats_disable_l3(struct net_device *dev) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + + call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; +} + +int netdev_offload_xstats_disable(struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + if (!netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + netdev_offload_xstats_disable_l3(dev); + return 0; + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_disable); + +static void netdev_offload_xstats_disable_all(struct net_device *dev) +{ + netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); +} + +static struct rtnl_hw_stats64 * +netdev_offload_xstats_get_ptr(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return dev->offload_xstats_l3; + } + + WARN_ON(1); + return NULL; +} + +bool netdev_offload_xstats_enabled(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + return netdev_offload_xstats_get_ptr(dev, type); +} +EXPORT_SYMBOL(netdev_offload_xstats_enabled); + +struct netdev_notifier_offload_xstats_ru { + bool used; +}; + +struct netdev_notifier_offload_xstats_rd { + struct rtnl_hw_stats64 stats; + bool used; +}; + +static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, + const struct rtnl_hw_stats64 *src) +{ + dest->rx_packets += src->rx_packets; + dest->tx_packets += src->tx_packets; + dest->rx_bytes += src->rx_bytes; + dest->tx_bytes += src->tx_bytes; + dest->rx_errors += src->rx_errors; + dest->tx_errors += src->tx_errors; + dest->rx_dropped += src->rx_dropped; + dest->tx_dropped += src->tx_dropped; + dest->multicast += src->multicast; +} + +static int netdev_offload_xstats_get_used(struct net_device *dev, + enum netdev_offload_xstats_type type, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_ru report_used = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_used = &report_used, + }; + int rc; + + WARN_ON(!netdev_offload_xstats_enabled(dev, type)); + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, + &info.info); + *p_used = report_used.used; + return notifier_to_errno(rc); +} + +static int netdev_offload_xstats_get_stats(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_rd report_delta = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_delta = &report_delta, + }; + struct rtnl_hw_stats64 *stats; + int rc; + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return -EINVAL; + + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, + &info.info); + + /* Cache whatever we got, even if there was an error, otherwise the + * successful stats retrievals would get lost. + */ + netdev_hw_stats64_add(stats, &report_delta.stats); + + if (p_stats) + *p_stats = *stats; + *p_used = report_delta.used; + + return notifier_to_errno(rc); +} + +int netdev_offload_xstats_get(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, bool *p_used, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (p_stats) + return netdev_offload_xstats_get_stats(dev, type, p_stats, + p_used, extack); + else + return netdev_offload_xstats_get_used(dev, type, p_used, + extack); +} +EXPORT_SYMBOL(netdev_offload_xstats_get); + +void +netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, + const struct rtnl_hw_stats64 *stats) +{ + report_delta->used = true; + netdev_hw_stats64_add(&report_delta->stats, stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_report_delta); + +void +netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) +{ + report_used->used = true; +} +EXPORT_SYMBOL(netdev_offload_xstats_report_used); + +void netdev_offload_xstats_push_delta(struct net_device *dev, + enum netdev_offload_xstats_type type, + const struct rtnl_hw_stats64 *p_stats) +{ + struct rtnl_hw_stats64 *stats; + + ASSERT_RTNL(); + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return; + + netdev_hw_stats64_add(stats, p_stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_push_delta); + /** * netdev_get_xmit_slave - Get the xmit slave of master device * @dev: device @@ -9004,7 +8751,6 @@ void dev_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } -EXPORT_SYMBOL(dev_set_group); /** * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. @@ -9119,7 +8865,6 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier) return -ENODEV; return ops->ndo_change_carrier(dev, new_carrier); } -EXPORT_SYMBOL(dev_change_carrier); /** * dev_get_phys_port_id - Get device physical port ID @@ -9137,7 +8882,6 @@ int dev_get_phys_port_id(struct net_device *dev, return -EOPNOTSUPP; return ops->ndo_get_phys_port_id(dev, ppid); } -EXPORT_SYMBOL(dev_get_phys_port_id); /** * dev_get_phys_port_name - Get device physical port name @@ -9160,7 +8904,6 @@ int dev_get_phys_port_name(struct net_device *dev, } return devlink_compat_phys_port_name_get(dev, name, len); } -EXPORT_SYMBOL(dev_get_phys_port_name); /** * dev_get_port_parent_id - Get the device's port parent identifier @@ -9224,35 +8967,17 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) EXPORT_SYMBOL(netdev_port_same_parent_id); /** - * dev_change_proto_down - update protocol port state information + * dev_change_proto_down - set carrier according to proto_down. + * * @dev: device * @proto_down: new value - * - * This info can be used by switch drivers to set the phys state of the - * port. */ int dev_change_proto_down(struct net_device *dev, bool proto_down) { - const struct net_device_ops *ops = dev->netdev_ops; - - if (!ops->ndo_change_proto_down) + if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; - return ops->ndo_change_proto_down(dev, proto_down); -} -EXPORT_SYMBOL(dev_change_proto_down); - -/** - * dev_change_proto_down_generic - generic implementation for - * ndo_change_proto_down that sets carrier according to - * proto_down. - * - * @dev: device - * @proto_down: new value - */ -int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) -{ if (proto_down) netif_carrier_off(dev); else @@ -9260,7 +8985,6 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) dev->proto_down = proto_down; return 0; } -EXPORT_SYMBOL(dev_change_proto_down_generic); /** * dev_change_proto_down_reason - proto down reason @@ -9285,7 +9009,6 @@ void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, } } } -EXPORT_SYMBOL(dev_change_proto_down_reason); struct bpf_xdp_link { struct bpf_link link; @@ -9656,6 +9379,12 @@ static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, goto out_unlock; } old_prog = link->prog; + if (old_prog->type != new_prog->type || + old_prog->expected_attach_type != new_prog->expected_attach_type) { + err = -EINVAL; + goto out_unlock; + } + if (old_prog == new_prog) { /* no-op, don't disturb drivers */ bpf_prog_put(new_prog); @@ -9806,13 +9535,13 @@ static int dev_new_index(struct net *net) } /* Delayed registration/unregisteration */ -static LIST_HEAD(net_todo_list); +LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); - dev_net(dev)->dev_unreg_count++; + atomic_inc(&dev_net(dev)->dev_unreg_count); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, @@ -10213,22 +9942,14 @@ void netif_tx_stop_all_queues(struct net_device *dev) EXPORT_SYMBOL(netif_tx_stop_all_queues); /** - * register_netdevice - register a network device - * @dev: device to register - * - * Take a completed network device structure and add it to the kernel - * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier - * chain. 0 is returned on success. A negative errno code is returned - * on a failure to set up the device, or if the name is a duplicate. - * - * Callers must hold the rtnl semaphore. You may want - * register_netdev() instead of this. + * register_netdevice() - register a network device + * @dev: device to register * - * BUGS: - * The locking appears insufficient to guarantee two parallel registers - * will not get the same name. + * Take a prepared network device structure and make it externally accessible. + * A %NETDEV_REGISTER message is sent to the netdev notifier chain. + * Callers must hold the rtnl lock - you may want register_netdev() + * instead of this. */ - int register_netdevice(struct net_device *dev) { int ret; @@ -10334,11 +10055,11 @@ int register_netdevice(struct net_device *dev) goto err_uninit; ret = netdev_register_kobject(dev); - if (ret) { - dev->reg_state = NETREG_UNREGISTERED; + write_lock(&dev_base_lock); + dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; + write_unlock(&dev_base_lock); + if (ret) goto err_uninit; - } - dev->reg_state = NETREG_REGISTERED; __netdev_update_features(dev); @@ -10352,8 +10073,10 @@ int register_netdevice(struct net_device *dev) linkwatch_init_dev(dev); dev_init_scheduler(dev); - dev_hold(dev); + + netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL); list_netdevice(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); /* If the device has permanent device address, driver should @@ -10482,8 +10205,8 @@ int netdev_unregister_timeout_secs __read_mostly = 10; #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** - * netdev_wait_allrefs - wait until all references are gone. - * @dev: target net_device + * netdev_wait_allrefs_any - wait until all references are gone. + * @list: list of net_devices to wait on * * This is called when unregistering network devices. * @@ -10493,37 +10216,42 @@ int netdev_unregister_timeout_secs __read_mostly = 10; * We can get stuck here if buggy protocols don't correctly * call dev_put. */ -static void netdev_wait_allrefs(struct net_device *dev) +static struct net_device *netdev_wait_allrefs_any(struct list_head *list) { unsigned long rebroadcast_time, warning_time; - int wait = 0, refcnt; - - linkwatch_forget_dev(dev); + struct net_device *dev; + int wait = 0; rebroadcast_time = warning_time = jiffies; - refcnt = netdev_refcnt_read(dev); - while (refcnt != 1) { + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; + + while (true) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + list_for_each_entry(dev, list, todo_list) + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); __rtnl_unlock(); rcu_barrier(); rtnl_lock(); - if (test_bit(__LINK_STATE_LINKWATCH_PENDING, - &dev->state)) { - /* We must not have linkwatch events - * pending on unregister. If this - * happens, we simply run the queue - * unscheduled, resulting in a noop - * for this device. - */ - linkwatch_run_queue(); - } + list_for_each_entry(dev, list, todo_list) + if (test_bit(__LINK_STATE_LINKWATCH_PENDING, + &dev->state)) { + /* We must not have linkwatch events + * pending on unregister. If this + * happens, we simply run the queue + * unscheduled, resulting in a noop + * for this device. + */ + linkwatch_run_queue(); + break; + } __rtnl_unlock(); @@ -10538,13 +10266,18 @@ static void netdev_wait_allrefs(struct net_device *dev) wait = min(wait << 1, WAIT_REFS_MAX_MSECS); } - refcnt = netdev_refcnt_read(dev); + list_for_each_entry(dev, list, todo_list) + if (netdev_refcnt_read(dev) == 1) + return dev; + + if (time_after(jiffies, warning_time + + READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { + list_for_each_entry(dev, list, todo_list) { + pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", + dev->name, netdev_refcnt_read(dev)); + ref_tracker_dir_print(&dev->refcnt_tracker, 10); + } - if (refcnt != 1 && - time_after(jiffies, warning_time + - netdev_unregister_timeout_secs * HZ)) { - pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", - dev->name, refcnt); warning_time = jiffies; } } @@ -10576,6 +10309,7 @@ static void netdev_wait_allrefs(struct net_device *dev) */ void netdev_run_todo(void) { + struct net_device *dev, *tmp; struct list_head list; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; @@ -10596,26 +10330,26 @@ void netdev_run_todo(void) __rtnl_unlock(); - /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); - while (!list_empty(&list)) { - struct net_device *dev - = list_first_entry(&list, struct net_device, todo_list); - list_del(&dev->todo_list); - + list_for_each_entry_safe(dev, tmp, &list, todo_list) { if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { - pr_err("network todo '%s' but state %d\n", - dev->name, dev->reg_state); - dump_stack(); + netdev_WARN(dev, "run_todo but not unregistering\n"); + list_del(&dev->todo_list); continue; } + write_lock(&dev_base_lock); dev->reg_state = NETREG_UNREGISTERED; + write_unlock(&dev_base_lock); + linkwatch_forget_dev(dev); + } - netdev_wait_allrefs(dev); + while (!list_empty(&list)) { + dev = netdev_wait_allrefs_any(&list); + list_del(&dev->todo_list); /* paranoia */ BUG_ON(netdev_refcnt_read(dev) != 1); @@ -10623,19 +10357,14 @@ void netdev_run_todo(void) BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); -#if IS_ENABLED(CONFIG_DECNET) - WARN_ON(dev->dn_ptr); -#endif + if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) free_netdev(dev); - /* Report a network device has been unregistered */ - rtnl_lock(); - dev_net(dev)->dev_unreg_count--; - __rtnl_unlock(); - wake_up(&netdev_unregistering_wq); + if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) + wake_up(&netdev_unregistering_wq); /* Free network device */ kobject_put(&dev->dev.kobj); @@ -10671,6 +10400,21 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, } EXPORT_SYMBOL(netdev_stats_to_stats64); +struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev) +{ + struct net_device_core_stats __percpu *p; + + p = alloc_percpu_gfp(struct net_device_core_stats, + GFP_ATOMIC | __GFP_NOWARN); + + if (p && cmpxchg(&dev->core_stats, NULL, p)) + free_percpu(p); + + /* This READ_ONCE() pairs with the cmpxchg() above */ + return READ_ONCE(dev->core_stats); +} +EXPORT_SYMBOL(netdev_core_stats_alloc); + /** * dev_get_stats - get network device statistics * @dev: device to get statistics from @@ -10685,6 +10429,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage) { const struct net_device_ops *ops = dev->netdev_ops; + const struct net_device_core_stats __percpu *p; if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); @@ -10694,9 +10439,21 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } else { netdev_stats_to_stats64(storage, &dev->stats); } - storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped); - storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped); - storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler); + + /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ + p = READ_ONCE(dev->core_stats); + if (p) { + const struct net_device_core_stats *core_stats; + int i; + + for_each_possible_cpu(i) { + core_stats = per_cpu_ptr(p, i); + storage->rx_dropped += READ_ONCE(core_stats->rx_dropped); + storage->tx_dropped += READ_ONCE(core_stats->tx_dropped); + storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler); + storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped); + } + } return storage; } EXPORT_SYMBOL(dev_get_stats); @@ -10714,23 +10471,23 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, int cpu; for_each_possible_cpu(cpu) { + u64 rx_packets, rx_bytes, tx_packets, tx_bytes; const struct pcpu_sw_netstats *stats; - struct pcpu_sw_netstats tmp; unsigned int start; stats = per_cpu_ptr(netstats, cpu); do { start = u64_stats_fetch_begin_irq(&stats->syncp); - tmp.rx_packets = stats->rx_packets; - tmp.rx_bytes = stats->rx_bytes; - tmp.tx_packets = stats->tx_packets; - tmp.tx_bytes = stats->tx_bytes; + rx_packets = u64_stats_read(&stats->rx_packets); + rx_bytes = u64_stats_read(&stats->rx_bytes); + tx_packets = u64_stats_read(&stats->tx_packets); + tx_bytes = u64_stats_read(&stats->tx_bytes); } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); - s->rx_packets += tmp.rx_packets; - s->rx_bytes += tmp.rx_bytes; - s->tx_packets += tmp.tx_packets; - s->tx_bytes += tmp.tx_bytes; + s->rx_packets += rx_packets; + s->rx_bytes += rx_bytes; + s->tx_packets += tx_packets; + s->tx_bytes += tx_bytes; } } EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); @@ -10835,11 +10592,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; + ref_tracker_dir_init(&dev->refcnt_tracker, 128); #ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; - dev_hold(dev); + __dev_hold(dev); #else refcount_set(&dev->dev_refcnt, 1); #endif @@ -10852,8 +10610,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; + dev->gro_max_size = GRO_LEGACY_MAX_SIZE; + dev->tso_max_size = TSO_LEGACY_MAX_SIZE; + dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; #ifdef CONFIG_LOCKDEP @@ -10951,10 +10712,13 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); + ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; #endif + free_percpu(dev->core_stats); + dev->core_stats = NULL; free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; @@ -11058,9 +10822,10 @@ void unregister_netdevice_many(struct list_head *head) list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ - unlist_netdevice(dev); - + write_lock(&dev_base_lock); + unlist_netdevice(dev, false); dev->reg_state = NETREG_UNREGISTERING; + write_unlock(&dev_base_lock); } flush_all_backlogs(); @@ -11074,6 +10839,8 @@ void unregister_netdevice_many(struct list_head *head) dev_xdp_uninstall(dev); + netdev_offload_xstats_disable_all(dev); + /* Notify protocols, that we are about to destroy * this device. They should clean all the things. */ @@ -11114,7 +10881,7 @@ void unregister_netdevice_many(struct list_head *head) synchronize_net(); list_for_each_entry(dev, head, unreg_list) { - dev_put(dev); + netdev_put(dev, &dev->dev_registered_tracker); net_set_todo(dev); } @@ -11205,7 +10972,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_close(dev); /* And unlink it from device chain */ - unlist_netdevice(dev); + unlist_netdevice(dev, true); synchronize_net(); @@ -11339,11 +11106,11 @@ static int dev_cpu_dead(unsigned int oldcpu) /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { - netif_rx_ni(skb); + netif_rx(skb); input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { - netif_rx_ni(skb); + netif_rx(skb); input_queue_head_incr(oldsd); } @@ -11397,8 +11164,7 @@ static int __net_init netdev_init(struct net *net) BUILD_BUG_ON(GRO_HASH_BUCKETS > 8 * sizeof_field(struct napi_struct, gro_bitmask)); - if (net != &init_net) - INIT_LIST_HEAD(&net->dev_base_head); + INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) @@ -11514,14 +11280,14 @@ static struct pernet_operations __net_initdata netdev_net_ops = { .exit = netdev_exit, }; -static void __net_exit default_device_exit(struct net *net) +static void __net_exit default_device_exit_net(struct net *net) { struct net_device *dev, *aux; /* * Push all migratable network devices back to the * initial network namespace */ - rtnl_lock(); + ASSERT_RTNL(); for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; @@ -11545,35 +11311,6 @@ static void __net_exit default_device_exit(struct net *net) BUG(); } } - rtnl_unlock(); -} - -static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) -{ - /* Return with the rtnl_lock held when there are no network - * devices unregistering in any network namespace in net_list. - */ - struct net *net; - bool unregistering; - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - add_wait_queue(&netdev_unregistering_wq, &wait); - for (;;) { - unregistering = false; - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) { - if (net->dev_unreg_count > 0) { - unregistering = true; - break; - } - } - if (!unregistering) - break; - __rtnl_unlock(); - - wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); - } - remove_wait_queue(&netdev_unregistering_wq, &wait); } static void __net_exit default_device_exit_batch(struct list_head *net_list) @@ -11587,18 +11324,12 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) struct net *net; LIST_HEAD(dev_kill_list); - /* To prevent network device cleanup code from dereferencing - * loopback devices or network devices that have been freed - * wait here for all pending unregistrations to complete, - * before unregistring the loopback device and allowing the - * network namespace be freed. - * - * The netdev todo list containing all network devices - * unregistrations that happen in default_device_exit_batch - * will run in the rtnl_unlock() at the end of - * default_device_exit_batch. - */ - rtnl_lock_unregistering(net_list); + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + default_device_exit_net(net); + cond_resched(); + } + list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) @@ -11612,7 +11343,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) } static struct pernet_operations __net_initdata default_device_ops = { - .exit = default_device_exit, .exit_batch = default_device_exit_batch, }; @@ -11643,8 +11373,6 @@ static int __init net_dev_init(void) for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); - INIT_LIST_HEAD(&offload_base); - if (register_pernet_subsys(&netdev_net_ops)) goto out; @@ -11669,6 +11397,8 @@ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif + INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); + spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; diff --git a/net/core/dev.h b/net/core/dev.h new file mode 100644 index 000000000000..cbb8a925175a --- /dev/null +++ b/net/core/dev.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_CORE_DEV_H +#define _NET_CORE_DEV_H + +#include <linux/types.h> + +struct net; +struct net_device; +struct netdev_bpf; +struct netdev_phys_item_id; +struct netlink_ext_ack; + +/* Random bits of netdevice that don't need to be exposed */ +#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ +struct sd_flow_limit { + u64 count; + unsigned int num_buckets; + unsigned int history_head; + u16 history[FLOW_LIMIT_HISTORY]; + u8 buckets[]; +}; + +extern int netdev_flow_limit_table_len; + +#ifdef CONFIG_PROC_FS +int __init dev_proc_init(void); +#else +#define dev_proc_init() 0 +#endif + +void linkwatch_init_dev(struct net_device *dev); +void linkwatch_forget_dev(struct net_device *dev); +void linkwatch_run_queue(void); + +void dev_addr_flush(struct net_device *dev); +int dev_addr_init(struct net_device *dev); +void dev_addr_check(struct net_device *dev); + +/* sysctls not referred to from outside net/core/ */ +extern int netdev_budget; +extern unsigned int netdev_budget_usecs; +extern unsigned int sysctl_skb_defer_max; +extern int netdev_tstamp_prequeue; +extern int netdev_unregister_timeout_secs; +extern int weight_p; +extern int dev_weight_rx_bias; +extern int dev_weight_tx_bias; + +/* rtnl helpers */ +extern struct list_head net_todo_list; +void netdev_run_todo(void); + +/* netdev management, shared between various uAPI entry points */ +struct netdev_name_node { + struct hlist_node hlist; + struct list_head list; + struct net_device *dev; + const char *name; +}; + +int netdev_get_name(struct net *net, char *name, int ifindex); +int dev_change_name(struct net_device *dev, const char *newname); + +int netdev_name_node_alt_create(struct net_device *dev, const char *name); +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); + +int dev_validate_mtu(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); +int dev_set_mtu_ext(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); + +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_item_id *ppid); +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len); + +int dev_change_proto_down(struct net_device *dev, bool proto_down); +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value); + +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, int expected_fd, u32 flags); + +int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len); +void dev_set_group(struct net_device *dev, int new_group); +int dev_change_carrier(struct net_device *dev, bool new_carrier); + +void __dev_set_rx_mode(struct net_device *dev); + +static inline void netif_set_gso_max_size(struct net_device *dev, + unsigned int size) +{ + /* dev->gso_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_size, size); +} + +static inline void netif_set_gso_max_segs(struct net_device *dev, + unsigned int segs) +{ + /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_segs, segs); +} + +static inline void netif_set_gro_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_max_size, size); +} + +#endif diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index f0cb38344126..baa63dee2829 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -12,10 +12,41 @@ #include <linux/export.h> #include <linux/list.h> +#include "dev.h" + /* * General list handling functions */ +static int __hw_addr_insert(struct netdev_hw_addr_list *list, + struct netdev_hw_addr *new, int addr_len) +{ + struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; + struct netdev_hw_addr *ha; + + while (*ins_point) { + int diff; + + ha = rb_entry(*ins_point, struct netdev_hw_addr, node); + diff = memcmp(new->addr, ha->addr, addr_len); + if (diff == 0) + diff = memcmp(&new->type, &ha->type, sizeof(new->type)); + + parent = *ins_point; + if (diff < 0) + ins_point = &parent->rb_left; + else if (diff > 0) + ins_point = &parent->rb_right; + else + return -EEXIST; + } + + rb_link_node_rcu(&new->node, parent, ins_point); + rb_insert_color(&new->node, &list->tree); + + return 0; +} + static struct netdev_hw_addr* __hw_addr_create(const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) @@ -50,11 +81,6 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, if (addr_len > MAX_ADDR_LEN) return -EINVAL; - ha = list_first_entry(&list->list, struct netdev_hw_addr, list); - if (ha && !memcmp(addr, ha->addr, addr_len) && - (!addr_type || addr_type == ha->type)) - goto found_it; - while (*ins_point) { int diff; @@ -69,7 +95,6 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, } else if (diff > 0) { ins_point = &parent->rb_right; } else { -found_it: if (exclusive) return -EEXIST; if (global) { @@ -94,16 +119,8 @@ found_it: if (!ha) return -ENOMEM; - /* The first address in dev->dev_addrs is pointed to by dev->dev_addr - * and mutated freely by device drivers and netdev ops, so if we insert - * it into the tree we'll end up with an invalid rbtree. - */ - if (list->count > 0) { - rb_link_node(&ha->node, parent, ins_point); - rb_insert_color(&ha->node, &list->tree); - } else { - RB_CLEAR_NODE(&ha->node); - } + rb_link_node(&ha->node, parent, ins_point); + rb_insert_color(&ha->node, &list->tree); list_add_tail_rcu(&ha->list, &list->list); list->count++; @@ -138,8 +155,7 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, if (--ha->refcount) return 0; - if (!RB_EMPTY_NODE(&ha->node)) - rb_erase(&ha->node, &list->tree); + rb_erase(&ha->node, &list->tree); list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); @@ -151,18 +167,8 @@ static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { - struct netdev_hw_addr *ha; struct rb_node *node; - /* The first address isn't inserted into the tree because in the dev->dev_addrs - * list it's the address pointed to by dev->dev_addr which is freely mutated - * in place, so we need to check it separately. - */ - ha = list_first_entry(&list->list, struct netdev_hw_addr, list); - if (ha && !memcmp(addr, ha->addr, addr_len) && - (!addr_type || addr_type == ha->type)) - return ha; - node = list->tree.rb_node; while (node) { @@ -498,6 +504,21 @@ EXPORT_SYMBOL(__hw_addr_init); * Device addresses handling functions */ +/* Check that netdev->dev_addr is not written to directly as this would + * break the rbtree layout. All changes should go thru dev_addr_set() and co. + * Remove this check in mid-2024. + */ +void dev_addr_check(struct net_device *dev) +{ + if (!memcmp(dev->dev_addr, dev->dev_addr_shadow, MAX_ADDR_LEN)) + return; + + netdev_warn(dev, "Current addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr); + netdev_warn(dev, "Expected addr: %*ph\n", + MAX_ADDR_LEN, dev->dev_addr_shadow); + netdev_WARN(dev, "Incorrect netdev->dev_addr\n"); +} + /** * dev_addr_flush - Flush device address list * @dev: device @@ -509,11 +530,11 @@ EXPORT_SYMBOL(__hw_addr_init); void dev_addr_flush(struct net_device *dev) { /* rtnl_mutex must be held here */ + dev_addr_check(dev); __hw_addr_flush(&dev->dev_addrs); dev->dev_addr = NULL; } -EXPORT_SYMBOL(dev_addr_flush); /** * dev_addr_init - Init device address list @@ -547,7 +568,21 @@ int dev_addr_init(struct net_device *dev) } return err; } -EXPORT_SYMBOL(dev_addr_init); + +void dev_addr_mod(struct net_device *dev, unsigned int offset, + const void *addr, size_t len) +{ + struct netdev_hw_addr *ha; + + dev_addr_check(dev); + + ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]); + rb_erase(&ha->node, &dev->dev_addrs.tree); + memcpy(&ha->addr[offset], addr, len); + memcpy(&dev->dev_addr_shadow[offset], addr, len); + WARN_ON(__hw_addr_insert(&dev->dev_addrs, ha, dev->addr_len)); +} +EXPORT_SYMBOL(dev_addr_mod); /** * dev_addr_add - Add a device address diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c new file mode 100644 index 000000000000..049cfbc58aa9 --- /dev/null +++ b/net/core/dev_addr_lists_test.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <kunit/test.h> +#include <linux/etherdevice.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> + +static const struct net_device_ops dummy_netdev_ops = { +}; + +struct dev_addr_test_priv { + u32 addr_seen; +}; + +static int dev_addr_test_sync(struct net_device *netdev, const unsigned char *a) +{ + struct dev_addr_test_priv *datp = netdev_priv(netdev); + + if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) + datp->addr_seen |= 1 << a[0]; + return 0; +} + +static int dev_addr_test_unsync(struct net_device *netdev, + const unsigned char *a) +{ + struct dev_addr_test_priv *datp = netdev_priv(netdev); + + if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN)) + datp->addr_seen &= ~(1 << a[0]); + return 0; +} + +static int dev_addr_test_init(struct kunit *test) +{ + struct dev_addr_test_priv *datp; + struct net_device *netdev; + int err; + + netdev = alloc_etherdev(sizeof(*datp)); + KUNIT_ASSERT_TRUE(test, !!netdev); + + test->priv = netdev; + netdev->netdev_ops = &dummy_netdev_ops; + + err = register_netdev(netdev); + if (err) { + free_netdev(netdev); + KUNIT_FAIL(test, "Can't register netdev %d", err); + } + + rtnl_lock(); + return 0; +} + +static void dev_addr_test_exit(struct kunit *test) +{ + struct net_device *netdev = test->priv; + + rtnl_unlock(); + unregister_netdev(netdev); + free_netdev(netdev); +} + +static void dev_addr_test_basic(struct kunit *test) +{ + struct net_device *netdev = test->priv; + u8 addr[ETH_ALEN]; + + KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr); + + memset(addr, 2, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + KUNIT_EXPECT_EQ(test, 0, memcmp(netdev->dev_addr, addr, sizeof(addr))); + + memset(addr, 3, sizeof(addr)); + dev_addr_set(netdev, addr); + KUNIT_EXPECT_EQ(test, 0, memcmp(netdev->dev_addr, addr, sizeof(addr))); +} + +static void dev_addr_test_sync_one(struct kunit *test) +{ + struct net_device *netdev = test->priv; + struct dev_addr_test_priv *datp; + u8 addr[ETH_ALEN]; + + datp = netdev_priv(netdev); + + memset(addr, 1, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 2, datp->addr_seen); + + memset(addr, 2, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + datp->addr_seen = 0; + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + /* It's not going to sync anything because the main address is + * considered synced and we overwrite in place. + */ + KUNIT_EXPECT_EQ(test, 0, datp->addr_seen); +} + +static void dev_addr_test_add_del(struct kunit *test) +{ + struct net_device *netdev = test->priv; + struct dev_addr_test_priv *datp; + u8 addr[ETH_ALEN]; + int i; + + datp = netdev_priv(netdev); + + for (i = 1; i < 4; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + } + /* Add 3 again */ + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen); + + KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen); + + for (i = 1; i < 4; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + } + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 1, datp->addr_seen); +} + +static void dev_addr_test_del_main(struct kunit *test) +{ + struct net_device *netdev = test->priv; + u8 addr[ETH_ALEN]; + + memset(addr, 1, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); +} + +static void dev_addr_test_add_set(struct kunit *test) +{ + struct net_device *netdev = test->priv; + struct dev_addr_test_priv *datp; + u8 addr[ETH_ALEN]; + int i; + + datp = netdev_priv(netdev); + + /* There is no external API like dev_addr_add_excl(), + * so shuffle the tree a little bit and exploit aliasing. + */ + for (i = 1; i < 16; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + } + + memset(addr, i, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr, + NETDEV_HW_ADDR_T_LAN)); + memset(addr, 0, sizeof(addr)); + eth_hw_addr_set(netdev, addr); + + __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync, + dev_addr_test_unsync); + KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen); +} + +static void dev_addr_test_add_excl(struct kunit *test) +{ + struct net_device *netdev = test->priv; + u8 addr[ETH_ALEN]; + int i; + + for (i = 0; i < 10; i++) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr)); + } + KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr)); + + for (i = 0; i < 10; i += 2) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr)); + } + for (i = 1; i < 10; i += 2) { + memset(addr, i, sizeof(addr)); + KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr)); + } +} + +static struct kunit_case dev_addr_test_cases[] = { + KUNIT_CASE(dev_addr_test_basic), + KUNIT_CASE(dev_addr_test_sync_one), + KUNIT_CASE(dev_addr_test_add_del), + KUNIT_CASE(dev_addr_test_del_main), + KUNIT_CASE(dev_addr_test_add_set), + KUNIT_CASE(dev_addr_test_add_excl), + {} +}; + +static struct kunit_suite dev_addr_test_suite = { + .name = "dev-addr-list-test", + .test_cases = dev_addr_test_cases, + .init = dev_addr_test_init, + .exit = dev_addr_test_exit, +}; +kunit_test_suite(dev_addr_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index cbab5fec64b1..7674bb9f3076 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -10,6 +10,8 @@ #include <net/dsa.h> #include <net/wext.h> +#include "dev.h" + /* * Map an interface index to its name (SIOCGIFNAME) */ @@ -192,7 +194,7 @@ static int net_hwtstamp_validate(struct ifreq *ifr) if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) return -EFAULT; - if (cfg.flags) /* reserved for future extensions */ + if (cfg.flags & ~HWTSTAMP_FLAG_MASK) return -EINVAL; tx_type = cfg.tx_type; @@ -313,6 +315,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); const struct net_device_ops *ops; + netdevice_tracker dev_tracker; if (!dev) return -ENODEV; @@ -381,10 +384,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return -ENODEV; if (!netif_is_bridge_master(dev)) return -EOPNOTSUPP; - dev_hold(dev); + netdev_hold(dev, &dev_tracker, GFP_KERNEL); rtnl_unlock(); err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL); - dev_put(dev); + netdev_put(dev, &dev_tracker); rtnl_lock(); return err; diff --git a/net/core/devlink.c b/net/core/devlink.c index c06c9ba6e8c5..89baa7c0938b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -7,6 +7,7 @@ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ +#include <linux/etherdevice.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> @@ -53,6 +54,8 @@ struct devlink { struct list_head trap_list; struct list_head trap_group_list; struct list_head trap_policer_list; + struct list_head linecard_list; + struct mutex linecards_lock; /* protects linecard_list */ const struct devlink_ops *ops; u64 features; struct xarray snapshot_ids; @@ -63,12 +66,61 @@ struct devlink { * port, sb, dpipe, resource, params, region, traps and more. */ struct mutex lock; + struct lock_class_key lock_key; u8 reload_failed:1; refcount_t refcount; struct completion comp; + struct rcu_head rcu; char priv[] __aligned(NETDEV_ALIGN); }; +struct devlink_linecard_ops; +struct devlink_linecard_type; + +struct devlink_linecard { + struct list_head list; + struct devlink *devlink; + unsigned int index; + refcount_t refcount; + const struct devlink_linecard_ops *ops; + void *priv; + enum devlink_linecard_state state; + struct mutex state_lock; /* Protects state */ + const char *type; + struct devlink_linecard_type *types; + unsigned int types_count; + struct devlink *nested_devlink; +}; + +/** + * struct devlink_resource - devlink resource + * @name: name of the resource + * @id: id, per devlink instance + * @size: size of the resource + * @size_new: updated size of the resource, reload is needed + * @size_valid: valid in case the total size of the resource is valid + * including its children + * @parent: parent resource + * @size_params: size parameters + * @list: parent list + * @resource_list: list of child resources + * @occ_get: occupancy getter callback + * @occ_get_priv: occupancy getter callback priv + */ +struct devlink_resource { + const char *name; + u64 id; + u64 size; + u64 size_new; + bool size_valid; + struct devlink_resource *parent; + struct devlink_resource_size_params size_params; + struct list_head list; + struct list_head resource_list; + devlink_resource_occ_get_t *occ_get; + void *occ_get_priv; +}; + void *devlink_priv(struct devlink *devlink) { return &devlink->priv; @@ -149,8 +201,13 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ DEVLINK_PORT_FN_STATE_ACTIVE), }; +static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = { + [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG }, +}; + static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); #define DEVLINK_REGISTERED XA_MARK_1 +#define DEVLINK_UNREGISTERING XA_MARK_2 /* devlink instances are open to the access from the user space after * devlink_register() call. Such logical barrier allows us to have certain @@ -168,24 +225,27 @@ static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); #define ASSERT_DEVLINK_NOT_REGISTERED(d) \ WARN_ON_ONCE(xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) -/* devlink_mutex - * - * An overall lock guarding every operation coming from userspace. - * It also guards devlink devices list and it is taken when - * driver registers/unregisters it. - */ -static DEFINE_MUTEX(devlink_mutex); - struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); } EXPORT_SYMBOL_GPL(devlink_net); +static void __devlink_put_rcu(struct rcu_head *head) +{ + struct devlink *devlink = container_of(head, struct devlink, rcu); + + complete(&devlink->comp); +} + void devlink_put(struct devlink *devlink) { if (refcount_dec_and_test(&devlink->refcount)) - complete(&devlink->comp); + /* Make sure unregister operation that may await the completion + * is unblocked only after all users are after the end of + * RCU grace period. + */ + call_rcu(&devlink->rcu, __devlink_put_rcu); } struct devlink *__must_check devlink_try_get(struct devlink *devlink) @@ -195,12 +255,103 @@ struct devlink *__must_check devlink_try_get(struct devlink *devlink) return NULL; } +void devl_assert_locked(struct devlink *devlink) +{ + lockdep_assert_held(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_assert_locked); + +#ifdef CONFIG_LOCKDEP +/* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */ +bool devl_lock_is_held(struct devlink *devlink) +{ + return lockdep_is_held(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_lock_is_held); +#endif + +void devl_lock(struct devlink *devlink) +{ + mutex_lock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_lock); + +int devl_trylock(struct devlink *devlink) +{ + return mutex_trylock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_trylock); + +void devl_unlock(struct devlink *devlink) +{ + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_unlock); + +static struct devlink * +devlinks_xa_find_get(struct net *net, unsigned long *indexp, xa_mark_t filter, + void * (*xa_find_fn)(struct xarray *, unsigned long *, + unsigned long, xa_mark_t)) +{ + struct devlink *devlink; + + rcu_read_lock(); +retry: + devlink = xa_find_fn(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED); + if (!devlink) + goto unlock; + + /* In case devlink_unregister() was already called and "unregistering" + * mark was set, do not allow to get a devlink reference here. + * This prevents live-lock of devlink_unregister() wait for completion. + */ + if (xa_get_mark(&devlinks, *indexp, DEVLINK_UNREGISTERING)) + goto retry; + + /* For a possible retry, the xa_find_after() should be always used */ + xa_find_fn = xa_find_after; + if (!devlink_try_get(devlink)) + goto retry; + if (!net_eq(devlink_net(devlink), net)) { + devlink_put(devlink); + goto retry; + } +unlock: + rcu_read_unlock(); + return devlink; +} + +static struct devlink *devlinks_xa_find_get_first(struct net *net, + unsigned long *indexp, + xa_mark_t filter) +{ + return devlinks_xa_find_get(net, indexp, filter, xa_find); +} + +static struct devlink *devlinks_xa_find_get_next(struct net *net, + unsigned long *indexp, + xa_mark_t filter) +{ + return devlinks_xa_find_get(net, indexp, filter, xa_find_after); +} + +/* Iterate over devlink pointers which were possible to get reference to. + * devlink_put() needs to be called for each iterated devlink pointer + * in loop body in order to release the reference. + */ +#define devlinks_xa_for_each_get(net, index, devlink, filter) \ + for (index = 0, \ + devlink = devlinks_xa_find_get_first(net, &index, filter); \ + devlink; devlink = devlinks_xa_find_get_next(net, &index, filter)) + +#define devlinks_xa_for_each_registered_get(net, index, devlink) \ + devlinks_xa_for_each_get(net, index, devlink, DEVLINK_REGISTERED) + static struct devlink *devlink_get_from_attrs(struct net *net, struct nlattr **attrs) { struct devlink *devlink; unsigned long index; - bool found = false; char *busname; char *devname; @@ -210,23 +361,23 @@ static struct devlink *devlink_get_from_attrs(struct net *net, busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]); devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); - lockdep_assert_held(&devlink_mutex); - - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + devlinks_xa_for_each_registered_get(net, index, devlink) { if (strcmp(devlink->dev->bus->name, busname) == 0 && - strcmp(dev_name(devlink->dev), devname) == 0 && - net_eq(devlink_net(devlink), net)) { - found = true; - break; - } + strcmp(dev_name(devlink->dev), devname) == 0) + return devlink; + devlink_put(devlink); } - if (!found || !devlink_try_get(devlink)) - devlink = ERR_PTR(-ENODEV); - - return devlink; + return ERR_PTR(-ENODEV); } +#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ + WARN_ON_ONCE(!(devlink_port)->registered) +#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ + WARN_ON_ONCE((devlink_port)->registered) +#define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \ + WARN_ON_ONCE(!(devlink_port)->initialized) + static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, unsigned int port_index) { @@ -340,6 +491,58 @@ devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) return ERR_PTR(-EINVAL); } +static struct devlink_linecard * +devlink_linecard_get_by_index(struct devlink *devlink, + unsigned int linecard_index) +{ + struct devlink_linecard *devlink_linecard; + + list_for_each_entry(devlink_linecard, &devlink->linecard_list, list) { + if (devlink_linecard->index == linecard_index) + return devlink_linecard; + } + return NULL; +} + +static bool devlink_linecard_index_exists(struct devlink *devlink, + unsigned int linecard_index) +{ + return devlink_linecard_get_by_index(devlink, linecard_index); +} + +static struct devlink_linecard * +devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) +{ + if (attrs[DEVLINK_ATTR_LINECARD_INDEX]) { + u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]); + struct devlink_linecard *linecard; + + mutex_lock(&devlink->linecards_lock); + linecard = devlink_linecard_get_by_index(devlink, linecard_index); + if (linecard) + refcount_inc(&linecard->refcount); + mutex_unlock(&devlink->linecards_lock); + if (!linecard) + return ERR_PTR(-ENODEV); + return linecard; + } + return ERR_PTR(-EINVAL); +} + +static struct devlink_linecard * +devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info) +{ + return devlink_linecard_get_from_attrs(devlink, info->attrs); +} + +static void devlink_linecard_put(struct devlink_linecard *linecard) +{ + if (refcount_dec_and_test(&linecard->refcount)) { + mutex_destroy(&linecard->state_lock); + kfree(linecard); + } +} + struct devlink_sb { struct list_head list; unsigned int index; @@ -506,6 +709,10 @@ struct devlink_region { const struct devlink_region_ops *ops; const struct devlink_port_region_ops *port_ops; }; + struct mutex snapshot_lock; /* protects snapshot_list, + * max_snapshots and cur_snapshots + * consistency. + */ struct list_head snapshot_list; u32 max_snapshots; u32 cur_snapshots; @@ -560,28 +767,20 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) #define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_RATE BIT(2) #define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) - -/* The per devlink instance lock is taken by default in the pre-doit - * operation, yet several commands do not require this. The global - * devlink lock is taken and protects from disruption by user-calls. - */ -#define DEVLINK_NL_FLAG_NO_LOCK BIT(4) +#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct devlink_linecard *linecard; struct devlink_port *devlink_port; struct devlink *devlink; int err; - mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(genl_info_net(info), info->attrs); - if (IS_ERR(devlink)) { - mutex_unlock(&devlink_mutex); + if (IS_ERR(devlink)) return PTR_ERR(devlink); - } - if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) - mutex_lock(&devlink->lock); + devl_lock(devlink); info->user_ptr[0] = devlink; if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { devlink_port = devlink_port_get_from_info(devlink, info); @@ -612,27 +811,35 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, goto unlock; } info->user_ptr[1] = rate_node; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { + linecard = devlink_linecard_get_from_info(devlink, info); + if (IS_ERR(linecard)) { + err = PTR_ERR(linecard); + goto unlock; + } + info->user_ptr[1] = linecard; } return 0; unlock: - if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); - mutex_unlock(&devlink_mutex); return err; } static void devlink_nl_post_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct devlink_linecard *linecard; struct devlink *devlink; devlink = info->user_ptr[0]; - if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) - mutex_unlock(&devlink->lock); + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) { + linecard = info->user_ptr[1]; + devlink_linecard_put(linecard); + } + devl_unlock(devlink); devlink_put(devlink); - mutex_unlock(&devlink_mutex); } static struct genl_family devlink_nl_family; @@ -654,6 +861,24 @@ static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) return 0; } +static int devlink_nl_put_nested_handle(struct sk_buff *msg, struct devlink *devlink) +{ + struct nlattr *nested_attr; + + nested_attr = nla_nest_start(msg, DEVLINK_ATTR_NESTED_DEVLINK); + if (!nested_attr) + return -EMSGSIZE; + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + nla_nest_end(msg, nested_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, nested_attr); + return -EMSGSIZE; +} + struct devlink_reload_combination { enum devlink_reload_action action; enum devlink_reload_limit limit; @@ -1101,6 +1326,10 @@ static int devlink_nl_port_fill(struct sk_buff *msg, goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) goto nla_put_failure; + if (devlink_port->linecard && + nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, + devlink_port->linecard->index)) + goto nla_put_failure; genlmsg_end(msg, hdr); return 0; @@ -1175,15 +1404,8 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, int idx = 0; int err = 0; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry; - - mutex_lock(&devlink->lock); + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + devl_lock(devlink); list_for_each_entry(devlink_rate, &devlink->rate_list, list) { enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; u32 id = NETLINK_CB(cb->skb).portid; @@ -1196,18 +1418,16 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, cb->nlh->nlmsg_seq, NLM_F_MULTI, NULL); if (err) { - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); goto out; } idx++; } - mutex_unlock(&devlink->lock); -retry: + devl_unlock(devlink); devlink_put(devlink); } out: - mutex_unlock(&devlink_mutex); if (err != -EMSGSIZE) return err; @@ -1278,16 +1498,7 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, int idx = 0; int err; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) { - devlink_put(devlink); - continue; - } - + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { if (idx < start) { idx++; devlink_put(devlink); @@ -1303,8 +1514,6 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, idx++; } out: - mutex_unlock(&devlink_mutex); - cb->args[0] = idx; return msg->len; } @@ -1341,15 +1550,8 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, int idx = 0; int err; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry; - - mutex_lock(&devlink->lock); + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + devl_lock(devlink); list_for_each_entry(devlink_port, &devlink->port_list, list) { if (idx < start) { idx++; @@ -1361,19 +1563,16 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); if (err) { - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); goto out; } idx++; } - mutex_unlock(&devlink->lock); -retry: + devl_unlock(devlink); devlink_put(devlink); } out: - mutex_unlock(&devlink_mutex); - cb->args[0] = idx; return msg->len; } @@ -1511,35 +1710,20 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, return 0; } -static int devlink_port_split(struct devlink *devlink, u32 port_index, - u32 count, struct netlink_ext_ack *extack) - -{ - if (devlink->ops->port_split) - return devlink->ops->port_split(devlink, port_index, count, - extack); - return -EOPNOTSUPP; -} - static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, struct genl_info *info) { + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; - struct devlink_port *devlink_port; - u32 port_index; u32 count; - if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] || - !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT)) return -EINVAL; + if (!devlink->ops->port_split) + return -EOPNOTSUPP; - devlink_port = devlink_port_get_from_info(devlink, info); - port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); - if (IS_ERR(devlink_port)) - return -EINVAL; - if (!devlink_port->attrs.splittable) { /* Split ports cannot be split. */ if (devlink_port->attrs.split) @@ -1554,34 +1738,24 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, return -EINVAL; } - return devlink_port_split(devlink, port_index, count, info->extack); -} - -static int devlink_port_unsplit(struct devlink *devlink, u32 port_index, - struct netlink_ext_ack *extack) - -{ - if (devlink->ops->port_unsplit) - return devlink->ops->port_unsplit(devlink, port_index, extack); - return -EOPNOTSUPP; + return devlink->ops->port_split(devlink, devlink_port, count, + info->extack); } static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info) { + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; - u32 port_index; - if (!info->attrs[DEVLINK_ATTR_PORT_INDEX]) - return -EINVAL; - - port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); - return devlink_port_unsplit(devlink, port_index, info->extack); + if (!devlink->ops->port_unsplit) + return -EOPNOTSUPP; + return devlink->ops->port_unsplit(devlink, devlink_port, info->extack); } -static int devlink_port_new_notifiy(struct devlink *devlink, - unsigned int port_index, - struct genl_info *info) +static int devlink_port_new_notify(struct devlink *devlink, + unsigned int port_index, + struct genl_info *info) { struct devlink_port *devlink_port; struct sk_buff *msg; @@ -1591,7 +1765,7 @@ static int devlink_port_new_notifiy(struct devlink *devlink, if (!msg) return -ENOMEM; - mutex_lock(&devlink->lock); + lockdep_assert_held(&devlink->lock); devlink_port = devlink_port_get_by_index(devlink, port_index); if (!devlink_port) { err = -ENODEV; @@ -1603,12 +1777,9 @@ static int devlink_port_new_notifiy(struct devlink *devlink, if (err) goto out; - err = genlmsg_reply(msg, info); - mutex_unlock(&devlink->lock); - return err; + return genlmsg_reply(msg, info); out: - mutex_unlock(&devlink->lock); nlmsg_free(msg); return err; } @@ -1656,7 +1827,7 @@ static int devlink_nl_cmd_port_new_doit(struct sk_buff *skb, if (err) return err; - err = devlink_port_new_notifiy(devlink, new_port_index, info); + err = devlink_port_new_notify(devlink, new_port_index, info); if (err && err != -ENODEV) { /* Fail to send the response; destroy newly created port. */ devlink->ops->port_del(devlink, new_port_index, extack); @@ -1674,7 +1845,7 @@ static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, if (!devlink->ops->port_del) return -EOPNOTSUPP; - if (!info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_INDEX)) { NL_SET_ERR_MSG_MOD(extack, "Port index is not specified"); return -EINVAL; } @@ -1932,6 +2103,316 @@ static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb, return err; } +struct devlink_linecard_type { + const char *type; + const void *priv; +}; + +static int devlink_nl_linecard_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_linecard *linecard, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, + struct netlink_ext_ack *extack) +{ + struct devlink_linecard_type *linecard_type; + struct nlattr *attr; + void *hdr; + int i; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_LINECARD_STATE, linecard->state)) + goto nla_put_failure; + if (linecard->type && + nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard->type)) + goto nla_put_failure; + + if (linecard->types_count) { + attr = nla_nest_start(msg, + DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES); + if (!attr) + goto nla_put_failure; + for (i = 0; i < linecard->types_count; i++) { + linecard_type = &linecard->types[i]; + if (nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, + linecard_type->type)) { + nla_nest_cancel(msg, attr); + goto nla_put_failure; + } + } + nla_nest_end(msg, attr); + } + + if (linecard->nested_devlink && + devlink_nl_put_nested_handle(msg, linecard->nested_devlink)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static void devlink_linecard_notify(struct devlink_linecard *linecard, + enum devlink_command cmd) +{ + struct devlink *devlink = linecard->devlink; + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW && + cmd != DEVLINK_CMD_LINECARD_DEL); + + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_linecard_fill(msg, devlink, linecard, cmd, 0, 0, 0, + NULL); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_linecard *linecard = info->user_ptr[1]; + struct devlink *devlink = linecard->devlink; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&linecard->state_lock); + err = devlink_nl_linecard_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_NEW, + info->snd_portid, info->snd_seq, 0, + info->extack); + mutex_unlock(&linecard->state_lock); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_linecard *linecard; + struct devlink *devlink; + int start = cb->args[0]; + unsigned long index; + int idx = 0; + int err; + + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + mutex_lock(&devlink->linecards_lock); + list_for_each_entry(linecard, &devlink->linecard_list, list) { + if (idx < start) { + idx++; + continue; + } + mutex_lock(&linecard->state_lock); + err = devlink_nl_linecard_fill(msg, devlink, linecard, + DEVLINK_CMD_LINECARD_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + cb->extack); + mutex_unlock(&linecard->state_lock); + if (err) { + mutex_unlock(&devlink->linecards_lock); + devlink_put(devlink); + goto out; + } + idx++; + } + mutex_unlock(&devlink->linecards_lock); + devlink_put(devlink); + } +out: + cb->args[0] = idx; + return msg->len; +} + +static struct devlink_linecard_type * +devlink_linecard_type_lookup(struct devlink_linecard *linecard, + const char *type) +{ + struct devlink_linecard_type *linecard_type; + int i; + + for (i = 0; i < linecard->types_count; i++) { + linecard_type = &linecard->types[i]; + if (!strcmp(type, linecard_type->type)) + return linecard_type; + } + return NULL; +} + +static int devlink_linecard_type_set(struct devlink_linecard *linecard, + const char *type, + struct netlink_ext_ack *extack) +{ + const struct devlink_linecard_ops *ops = linecard->ops; + struct devlink_linecard_type *linecard_type; + int err; + + mutex_lock(&linecard->state_lock); + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { + NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { + NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned"); + err = -EBUSY; + goto out; + } + + linecard_type = devlink_linecard_type_lookup(linecard, type); + if (!linecard_type) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported line card type provided"); + err = -EINVAL; + goto out; + } + + if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED && + linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { + NL_SET_ERR_MSG_MOD(extack, "Line card already provisioned"); + err = -EBUSY; + /* Check if the line card is provisioned in the same + * way the user asks. In case it is, make the operation + * to return success. + */ + if (ops->same_provision && + ops->same_provision(linecard, linecard->priv, + linecard_type->type, + linecard_type->priv)) + err = 0; + goto out; + } + + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING; + linecard->type = linecard_type->type; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + err = ops->provision(linecard, linecard->priv, linecard_type->type, + linecard_type->priv, extack); + if (err) { + /* Provisioning failed. Assume the linecard is unprovisioned + * for future operations. + */ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + } + return err; + +out: + mutex_unlock(&linecard->state_lock); + return err; +} + +static int devlink_linecard_type_unset(struct devlink_linecard *linecard, + struct netlink_ext_ack *extack) +{ + int err; + + mutex_lock(&linecard->state_lock); + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) { + NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) { + NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned"); + err = -EBUSY; + goto out; + } + if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) { + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + err = 0; + goto out; + } + + if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) { + NL_SET_ERR_MSG_MOD(extack, "Line card is not provisioned"); + err = 0; + goto out; + } + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + err = linecard->ops->unprovision(linecard, linecard->priv, + extack); + if (err) { + /* Unprovisioning failed. Assume the linecard is unprovisioned + * for future operations. + */ + mutex_lock(&linecard->state_lock); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); + } + return err; + +out: + mutex_unlock(&linecard->state_lock); + return err; +} + +static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_linecard *linecard = info->user_ptr[1]; + struct netlink_ext_ack *extack = info->extack; + int err; + + if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) { + const char *type; + + type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]); + if (strcmp(type, "")) { + err = devlink_linecard_type_set(linecard, type, extack); + if (err) + return err; + } else { + err = devlink_linecard_type_unset(linecard, extack); + if (err) + return err; + } + } + + return 0; +} + static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -2007,15 +2488,8 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, int idx = 0; int err; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry; - - mutex_lock(&devlink->lock); + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + devl_lock(devlink); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { if (idx < start) { idx++; @@ -2027,19 +2501,16 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) { - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); goto out; } idx++; } - mutex_unlock(&devlink->lock); -retry: + devl_unlock(devlink); devlink_put(devlink); } out: - mutex_unlock(&devlink_mutex); - cb->args[0] = idx; return msg->len; } @@ -2159,16 +2630,11 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, int idx = 0; int err = 0; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || - !devlink->ops->sb_pool_get) + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + if (!devlink->ops->sb_pool_get) goto retry; - mutex_lock(&devlink->lock); + devl_lock(devlink); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_pool_get_dumpit(msg, start, &idx, devlink, devlink_sb, @@ -2177,18 +2643,16 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, if (err == -EOPNOTSUPP) { err = 0; } else if (err) { - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); goto out; } } - mutex_unlock(&devlink->lock); + devl_unlock(devlink); retry: devlink_put(devlink); } out: - mutex_unlock(&devlink_mutex); - if (err != -EMSGSIZE) return err; @@ -2233,7 +2697,7 @@ static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb, if (err) return err; - if (!info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_POOL_SIZE)) return -EINVAL; size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]); @@ -2380,16 +2844,11 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, int idx = 0; int err = 0; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || - !devlink->ops->sb_port_pool_get) + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + if (!devlink->ops->sb_port_pool_get) goto retry; - mutex_lock(&devlink->lock); + devl_lock(devlink); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_port_pool_get_dumpit(msg, start, &idx, devlink, devlink_sb, @@ -2398,18 +2857,16 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, if (err == -EOPNOTSUPP) { err = 0; } else if (err) { - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); goto out; } } - mutex_unlock(&devlink->lock); + devl_unlock(devlink); retry: devlink_put(devlink); } out: - mutex_unlock(&devlink_mutex); - if (err != -EMSGSIZE) return err; @@ -2450,7 +2907,7 @@ static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb, if (err) return err; - if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD)) return -EINVAL; threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); @@ -2629,16 +3086,11 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, int idx = 0; int err = 0; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || - !devlink->ops->sb_tc_pool_bind_get) + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + if (!devlink->ops->sb_tc_pool_bind_get) goto retry; - mutex_lock(&devlink->lock); + devl_lock(devlink); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx, devlink, @@ -2648,18 +3100,16 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, if (err == -EOPNOTSUPP) { err = 0; } else if (err) { - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); goto out; } } - mutex_unlock(&devlink->lock); + devl_unlock(devlink); retry: devlink_put(devlink); } out: - mutex_unlock(&devlink_mutex); - if (err != -EMSGSIZE) return err; @@ -2713,7 +3163,7 @@ static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb, if (err) return err; - if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD)) return -EINVAL; threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); @@ -2836,15 +3286,11 @@ static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, { struct devlink_rate *devlink_rate; - /* Take the lock to sync with devlink_rate_nodes_destroy() */ - mutex_lock(&devlink->lock); list_for_each_entry(devlink_rate, &devlink->rate_list, list) if (devlink_rate_is_node(devlink_rate)) { - mutex_unlock(&devlink->lock); NL_SET_ERR_MSG_MOD(extack, "Rate node(s) exists."); return -EBUSY; } - mutex_unlock(&devlink->lock); return 0; } @@ -3406,7 +3852,7 @@ static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb, struct devlink_dpipe_table *table; const char *table_name; - if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME)) return -EINVAL; table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); @@ -3590,8 +4036,9 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, const char *table_name; bool counters_enable; - if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME] || - !info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME) || + GENL_REQ_ATTR_CHECK(info, + DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED)) return -EINVAL; table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); @@ -3680,8 +4127,8 @@ static int devlink_nl_cmd_resource_set(struct sk_buff *skb, u64 size; int err; - if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] || - !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_ID) || + GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_SIZE)) return -EINVAL; resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]); @@ -4303,10 +4750,76 @@ void devlink_flash_update_timeout_notify(struct devlink *devlink, } EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); +struct devlink_info_req { + struct sk_buff *msg; + void (*version_cb)(const char *version_name, + enum devlink_info_version_type version_type, + void *version_cb_priv); + void *version_cb_priv; +}; + +struct devlink_flash_component_lookup_ctx { + const char *lookup_name; + bool lookup_name_found; +}; + +static void +devlink_flash_component_lookup_cb(const char *version_name, + enum devlink_info_version_type version_type, + void *version_cb_priv) +{ + struct devlink_flash_component_lookup_ctx *lookup_ctx = version_cb_priv; + + if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT || + lookup_ctx->lookup_name_found) + return; + + lookup_ctx->lookup_name_found = + !strcmp(lookup_ctx->lookup_name, version_name); +} + +static int devlink_flash_component_get(struct devlink *devlink, + struct nlattr *nla_component, + const char **p_component, + struct netlink_ext_ack *extack) +{ + struct devlink_flash_component_lookup_ctx lookup_ctx = {}; + struct devlink_info_req req = {}; + const char *component; + int ret; + + if (!nla_component) + return 0; + + component = nla_data(nla_component); + + if (!devlink->ops->info_get) { + NL_SET_ERR_MSG_ATTR(extack, nla_component, + "component update is not supported by this device"); + return -EOPNOTSUPP; + } + + lookup_ctx.lookup_name = component; + req.version_cb = devlink_flash_component_lookup_cb; + req.version_cb_priv = &lookup_ctx; + + ret = devlink->ops->info_get(devlink, &req, NULL); + if (ret) + return ret; + + if (!lookup_ctx.lookup_name_found) { + NL_SET_ERR_MSG_ATTR(extack, nla_component, + "selected component is not supported by this device"); + return -EINVAL; + } + *p_component = component; + return 0; +} + static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *nla_component, *nla_overwrite_mask, *nla_file_name; + struct nlattr *nla_overwrite_mask, *nla_file_name; struct devlink_flash_update_params params = {}; struct devlink *devlink = info->user_ptr[0]; const char *file_name; @@ -4316,20 +4829,16 @@ static int devlink_nl_cmd_flash_update(struct sk_buff *skb, if (!devlink->ops->flash_update) return -EOPNOTSUPP; - if (!info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME)) return -EINVAL; - supported_params = devlink->ops->supported_flash_update_params; + ret = devlink_flash_component_get(devlink, + info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT], + ¶ms.component, info->extack); + if (ret) + return ret; - nla_component = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]; - if (nla_component) { - if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT)) { - NL_SET_ERR_MSG_ATTR(info->extack, nla_component, - "component update is not supported by this device"); - return -EOPNOTSUPP; - } - params.component = nla_data(nla_component); - } + supported_params = devlink->ops->supported_flash_update_params; nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK]; if (nla_overwrite_mask) { @@ -4361,6 +4870,202 @@ static int devlink_nl_cmd_flash_update(struct sk_buff *skb, return ret; } +static int +devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink, + u32 portid, u32 seq, int flags, + struct netlink_ext_ack *extack) +{ + struct nlattr *selftests; + void *hdr; + int err; + int i; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, + DEVLINK_CMD_SELFTESTS_GET); + if (!hdr) + return -EMSGSIZE; + + err = -EMSGSIZE; + if (devlink_nl_put_handle(msg, devlink)) + goto err_cancel_msg; + + selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); + if (!selftests) + goto err_cancel_msg; + + for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; + i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { + if (devlink->ops->selftest_check(devlink, i, extack)) { + err = nla_put_flag(msg, i); + if (err) + goto err_cancel_msg; + } + } + + nla_nest_end(msg, selftests); + genlmsg_end(msg, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(msg, hdr); + return err; +} + +static int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + if (!devlink->ops->selftest_check) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_selftests_fill(msg, devlink, info->snd_portid, + info->snd_seq, 0, info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_selftests_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + int start = cb->args[0]; + unsigned long index; + int idx = 0; + int err = 0; + + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + if (idx < start || !devlink->ops->selftest_check) + goto inc; + + devl_lock(devlink); + err = devlink_nl_selftests_fill(msg, devlink, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); + devl_unlock(devlink); + if (err) { + devlink_put(devlink); + break; + } +inc: + idx++; + devlink_put(devlink); + } + + if (err != -EMSGSIZE) + return err; + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id, + enum devlink_selftest_status test_status) +{ + struct nlattr *result_attr; + + result_attr = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT); + if (!result_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) || + nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS, + test_status)) + goto nla_put_failure; + + nla_nest_end(skb, result_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, result_attr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_selftests_run(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1]; + struct devlink *devlink = info->user_ptr[0]; + struct nlattr *attrs, *selftests; + struct sk_buff *msg; + void *hdr; + int err; + int i; + + if (!devlink->ops->selftest_run || !devlink->ops->selftest_check) + return -EOPNOTSUPP; + + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS)) + return -EINVAL; + + attrs = info->attrs[DEVLINK_ATTR_SELFTESTS]; + + err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, attrs, + devlink_selftest_nl_policy, info->extack); + if (err < 0) + return err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = -EMSGSIZE; + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN); + if (!hdr) + goto free_msg; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + + selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS); + if (!selftests) + goto genlmsg_cancel; + + for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1; + i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) { + enum devlink_selftest_status test_status; + + if (nla_get_flag(tb[i])) { + if (!devlink->ops->selftest_check(devlink, i, + info->extack)) { + if (devlink_selftest_result_put(msg, i, + DEVLINK_SELFTEST_STATUS_SKIP)) + goto selftests_nest_cancel; + continue; + } + + test_status = devlink->ops->selftest_run(devlink, i, + info->extack); + if (devlink_selftest_result_put(msg, i, test_status)) + goto selftests_nest_cancel; + } + } + + nla_nest_end(msg, selftests); + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +selftests_nest_cancel: + nla_nest_cancel(msg, selftests); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return err; +} + static const struct devlink_param devlink_param_generic[] = { { .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, @@ -4432,6 +5137,21 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP, + .name = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, + .name = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_NAME, + .type = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, + .name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME, + .type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) @@ -4495,7 +5215,7 @@ static int devlink_param_get(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->get) + if (!param->get || devlink->reload_failed) return -EOPNOTSUPP; return param->get(devlink, param->id, ctx); } @@ -4504,7 +5224,7 @@ static int devlink_param_set(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->set) + if (!param->set || devlink->reload_failed) return -EOPNOTSUPP; return param->set(devlink, param->id, ctx); } @@ -4705,15 +5425,8 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, int idx = 0; int err = 0; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry; - - mutex_lock(&devlink->lock); + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + devl_lock(devlink); list_for_each_entry(param_item, &devlink->param_list, list) { if (idx < start) { idx++; @@ -4727,19 +5440,16 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, if (err == -EOPNOTSUPP) { err = 0; } else if (err) { - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); goto out; } idx++; } - mutex_unlock(&devlink->lock); -retry: + devl_unlock(devlink); devlink_put(devlink); } out: - mutex_unlock(&devlink_mutex); - if (err != -EMSGSIZE) return err; @@ -4751,7 +5461,7 @@ static int devlink_param_type_get_from_info(struct genl_info *info, enum devlink_param_type *param_type) { - if (!info->attrs[DEVLINK_ATTR_PARAM_TYPE]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE)) return -EINVAL; switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { @@ -4828,7 +5538,7 @@ devlink_param_get_from_info(struct list_head *param_list, { char *param_name; - if (!info->attrs[DEVLINK_ATTR_PARAM_NAME]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_NAME)) return NULL; param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); @@ -4894,7 +5604,7 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, return err; } - if (!info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE)) return -EINVAL; cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]); if (!devlink_param_cmode_is_supported(param, cmode)) @@ -4932,99 +5642,22 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { - struct devlink_param_item *param_item; - struct devlink_port *devlink_port; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry; - - mutex_lock(&devlink->lock); - list_for_each_entry(devlink_port, &devlink->port_list, list) { - list_for_each_entry(param_item, - &devlink_port->param_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_param_fill(msg, - devlink_port->devlink, - devlink_port->index, param_item, - DEVLINK_CMD_PORT_PARAM_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - mutex_unlock(&devlink->lock); - devlink_put(devlink); - goto out; - } - idx++; - } - } - mutex_unlock(&devlink->lock); -retry: - devlink_put(devlink); - } -out: - mutex_unlock(&devlink_mutex); - - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; + NL_SET_ERR_MSG_MOD(cb->extack, "Port params are not supported"); return msg->len; } static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink_param_item *param_item; - struct sk_buff *msg; - int err; - - param_item = devlink_param_get_from_info(&devlink_port->param_list, - info); - if (!param_item) - return -EINVAL; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_param_fill(msg, devlink_port->devlink, - devlink_port->index, param_item, - DEVLINK_CMD_PORT_PARAM_GET, - info->snd_portid, info->snd_seq, 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); + NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported"); + return -EINVAL; } static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[1]; - - return __devlink_nl_cmd_param_set_doit(devlink_port->devlink, - devlink_port->index, - &devlink_port->param_list, info, - DEVLINK_CMD_PORT_PARAM_NEW); + NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported"); + return -EINVAL; } static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, @@ -5224,21 +5857,28 @@ static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id) { unsigned long count; void *p; + int err; - lockdep_assert_held(&devlink->lock); - + xa_lock(&devlink->snapshot_ids); p = xa_load(&devlink->snapshot_ids, id); - if (WARN_ON(!p)) - return -EINVAL; + if (WARN_ON(!p)) { + err = -EINVAL; + goto unlock; + } - if (WARN_ON(!xa_is_value(p))) - return -EINVAL; + if (WARN_ON(!xa_is_value(p))) { + err = -EINVAL; + goto unlock; + } count = xa_to_value(p); count++; - return xa_err(xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), - GFP_KERNEL)); + err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), + GFP_ATOMIC)); +unlock: + xa_unlock(&devlink->snapshot_ids); + return err; } /** @@ -5261,25 +5901,26 @@ static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id) unsigned long count; void *p; - lockdep_assert_held(&devlink->lock); - + xa_lock(&devlink->snapshot_ids); p = xa_load(&devlink->snapshot_ids, id); if (WARN_ON(!p)) - return; + goto unlock; if (WARN_ON(!xa_is_value(p))) - return; + goto unlock; count = xa_to_value(p); if (count > 1) { count--; - xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), - GFP_KERNEL); + __xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), + GFP_ATOMIC); } else { /* If this was the last user, we can erase this id */ - xa_erase(&devlink->snapshot_ids, id); + __xa_erase(&devlink->snapshot_ids, id); } +unlock: + xa_unlock(&devlink->snapshot_ids); } /** @@ -5300,13 +5941,17 @@ static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id) */ static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id) { - lockdep_assert_held(&devlink->lock); + int err; - if (xa_load(&devlink->snapshot_ids, id)) + xa_lock(&devlink->snapshot_ids); + if (xa_load(&devlink->snapshot_ids, id)) { + xa_unlock(&devlink->snapshot_ids); return -EEXIST; - - return xa_err(xa_store(&devlink->snapshot_ids, id, xa_mk_value(0), - GFP_KERNEL)); + } + err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(0), + GFP_ATOMIC)); + xa_unlock(&devlink->snapshot_ids); + return err; } /** @@ -5327,8 +5972,6 @@ static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id) */ static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) { - lockdep_assert_held(&devlink->lock); - return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1), xa_limit_32b, GFP_KERNEL); } @@ -5341,7 +5984,7 @@ static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) * Multiple snapshots can be created on a region. * The @snapshot_id should be obtained using the getter function. * - * Must be called only while holding the devlink instance lock. + * Must be called only while holding the region snapshot lock. * * @region: devlink region of the snapshot * @data: snapshot data @@ -5355,7 +5998,7 @@ __devlink_region_snapshot_create(struct devlink_region *region, struct devlink_snapshot *snapshot; int err; - lockdep_assert_held(&devlink->lock); + lockdep_assert_held(®ion->snapshot_lock); /* check if region can hold one more snapshot */ if (region->cur_snapshots == region->max_snapshots) @@ -5393,7 +6036,7 @@ static void devlink_region_snapshot_del(struct devlink_region *region, { struct devlink *devlink = region->devlink; - lockdep_assert_held(&devlink->lock); + lockdep_assert_held(®ion->snapshot_lock); devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL); region->cur_snapshots--; @@ -5414,7 +6057,7 @@ static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb, unsigned int index; int err; - if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) return -EINVAL; if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { @@ -5487,7 +6130,7 @@ static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg, struct devlink_port *port; int err = 0; - mutex_lock(&devlink->lock); + devl_lock(devlink); list_for_each_entry(region, &devlink->region_list, list) { if (*idx < start) { (*idx)++; @@ -5511,7 +6154,7 @@ static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg, } out: - mutex_unlock(&devlink->lock); + devl_unlock(devlink); return err; } @@ -5524,23 +6167,14 @@ static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, int idx = 0; int err = 0; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry; - + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { err = devlink_nl_cmd_region_get_devlink_dumpit(msg, cb, devlink, &idx, start); -retry: devlink_put(devlink); if (err) goto out; } out: - mutex_unlock(&devlink_mutex); cb->args[0] = idx; return msg->len; } @@ -5556,8 +6190,8 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb, unsigned int index; u32 snapshot_id; - if (!info->attrs[DEVLINK_ATTR_REGION_NAME] || - !info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME) || + GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_SNAPSHOT_ID)) return -EINVAL; region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); @@ -5579,11 +6213,15 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb, if (!region) return -EINVAL; + mutex_lock(®ion->snapshot_lock); snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); - if (!snapshot) + if (!snapshot) { + mutex_unlock(®ion->snapshot_lock); return -EINVAL; + } devlink_region_snapshot_del(region, snapshot); + mutex_unlock(®ion->snapshot_lock); return 0; } @@ -5601,7 +6239,7 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) u8 *data; int err; - if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) { + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) { NL_SET_ERR_MSG_MOD(info->extack, "No region name provided"); return -EINVAL; } @@ -5631,9 +6269,12 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; } + mutex_lock(®ion->snapshot_lock); + if (region->cur_snapshots == region->max_snapshots) { NL_SET_ERR_MSG_MOD(info->extack, "The region has reached the maximum number of stored snapshots"); - return -ENOSPC; + err = -ENOSPC; + goto unlock; } snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; @@ -5642,17 +6283,18 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); - return -EEXIST; + err = -EEXIST; + goto unlock; } err = __devlink_snapshot_id_insert(devlink, snapshot_id); if (err) - return err; + goto unlock; } else { err = __devlink_region_snapshot_id_get(devlink, &snapshot_id); if (err) { NL_SET_ERR_MSG_MOD(info->extack, "Failed to allocate a new snapshot id"); - return err; + goto unlock; } } @@ -5674,8 +6316,10 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); - if (WARN_ON(!snapshot)) - return -EINVAL; + if (WARN_ON(!snapshot)) { + err = -EINVAL; + goto unlock; + } msg = devlink_nl_region_notify_build(region, snapshot, DEVLINK_CMD_REGION_NEW, @@ -5690,16 +6334,20 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) goto err_notify; } + mutex_unlock(®ion->snapshot_lock); return 0; err_snapshot_create: region->ops->destructor(data); err_snapshot_capture: __devlink_snapshot_id_decrement(devlink, snapshot_id); + mutex_unlock(®ion->snapshot_lock); return err; err_notify: devlink_region_snapshot_del(region, snapshot); +unlock: + mutex_unlock(®ion->snapshot_lock); return err; } @@ -5794,14 +6442,11 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, start_offset = *((u64 *)&cb->args[0]); - mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); - if (IS_ERR(devlink)) { - err = PTR_ERR(devlink); - goto out_dev; - } + if (IS_ERR(devlink)) + return PTR_ERR(devlink); - mutex_lock(&devlink->lock); + devl_lock(devlink); if (!attrs[DEVLINK_ATTR_REGION_NAME] || !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { @@ -5897,34 +6542,30 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, nla_nest_end(skb, chunks_attr); genlmsg_end(skb, hdr); - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); - mutex_unlock(&devlink_mutex); - return skb->len; nla_put_failure: genlmsg_cancel(skb, hdr); out_unlock: - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); -out_dev: - mutex_unlock(&devlink_mutex); return err; } -struct devlink_info_req { - struct sk_buff *msg; -}; - int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) { + if (!req->msg) + return 0; return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name); } EXPORT_SYMBOL_GPL(devlink_info_driver_name_put); int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) { + if (!req->msg) + return 0; return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn); } EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); @@ -5932,6 +6573,8 @@ EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); int devlink_info_board_serial_number_put(struct devlink_info_req *req, const char *bsn) { + if (!req->msg) + return 0; return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, bsn); } @@ -5939,11 +6582,19 @@ EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put); static int devlink_info_version_put(struct devlink_info_req *req, int attr, const char *version_name, - const char *version_value) + const char *version_value, + enum devlink_info_version_type version_type) { struct nlattr *nest; int err; + if (req->version_cb) + req->version_cb(version_name, version_type, + req->version_cb_priv); + + if (!req->msg) + return 0; + nest = nla_nest_start_noflag(req->msg, attr); if (!nest) return -EMSGSIZE; @@ -5972,7 +6623,8 @@ int devlink_info_version_fixed_put(struct devlink_info_req *req, const char *version_value) { return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED, - version_name, version_value); + version_name, version_value, + DEVLINK_INFO_VERSION_TYPE_NONE); } EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put); @@ -5981,25 +6633,49 @@ int devlink_info_version_stored_put(struct devlink_info_req *req, const char *version_value) { return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, - version_name, version_value); + version_name, version_value, + DEVLINK_INFO_VERSION_TYPE_NONE); } EXPORT_SYMBOL_GPL(devlink_info_version_stored_put); +int devlink_info_version_stored_put_ext(struct devlink_info_req *req, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, + version_name, version_value, + version_type); +} +EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext); + int devlink_info_version_running_put(struct devlink_info_req *req, const char *version_name, const char *version_value) { return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, - version_name, version_value); + version_name, version_value, + DEVLINK_INFO_VERSION_TYPE_NONE); } EXPORT_SYMBOL_GPL(devlink_info_version_running_put); +int devlink_info_version_running_put_ext(struct devlink_info_req *req, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, + version_name, version_value, + version_type); +} +EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext); + static int devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { - struct devlink_info_req req; + struct devlink_info_req req = {}; void *hdr; int err; @@ -6058,23 +6734,16 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, int idx = 0; int err = 0; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry; - + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { if (idx < start || !devlink->ops->info_get) goto inc; - mutex_lock(&devlink->lock); + devl_lock(devlink); err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); - mutex_unlock(&devlink->lock); + devl_unlock(devlink); if (err == -EOPNOTSUPP) err = 0; else if (err) { @@ -6083,10 +6752,8 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, } inc: idx++; -retry: devlink_put(devlink); } - mutex_unlock(&devlink_mutex); if (err != -EMSGSIZE) return err; @@ -7075,6 +7742,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state prev_health_state; struct devlink *devlink = reporter->devlink; unsigned long recover_ts_threshold; + int ret; /* write a log message of the current error */ WARN_ON(!msg); @@ -7108,11 +7776,14 @@ int devlink_health_report(struct devlink_health_reporter *reporter, mutex_unlock(&reporter->dump_lock); } - if (reporter->auto_recover) - return devlink_health_reporter_recover(reporter, - priv_ctx, NULL); + if (!reporter->auto_recover) + return 0; - return 0; + devl_lock(devlink); + ret = devlink_health_reporter_recover(reporter, priv_ctx, NULL); + devl_unlock(devlink); + + return ret; } EXPORT_SYMBOL_GPL(devlink_health_report); @@ -7161,18 +7832,13 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb) struct nlattr **attrs = info->attrs; struct devlink *devlink; - mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); if (IS_ERR(devlink)) - goto unlock; + return NULL; reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); devlink_put(devlink); - mutex_unlock(&devlink_mutex); return reporter; -unlock: - mutex_unlock(&devlink_mutex); - return NULL; } void @@ -7238,14 +7904,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, int idx = 0; int err; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry_rep; - + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { mutex_lock(&devlink->reporters_lock); list_for_each_entry(reporter, &devlink->reporter_list, list) { @@ -7265,18 +7924,11 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, idx++; } mutex_unlock(&devlink->reporters_lock); -retry_rep: devlink_put(devlink); } - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry_port; - - mutex_lock(&devlink->lock); + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + devl_lock(devlink); list_for_each_entry(port, &devlink->port_list, list) { mutex_lock(&port->reporters_lock); list_for_each_entry(reporter, &port->reporter_list, list) { @@ -7291,7 +7943,7 @@ retry_rep: cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) { mutex_unlock(&port->reporters_lock); - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); goto out; } @@ -7299,13 +7951,10 @@ retry_rep: } mutex_unlock(&port->reporters_lock); } - mutex_unlock(&devlink->lock); -retry_port: + devl_unlock(devlink); devlink_put(devlink); } out: - mutex_unlock(&devlink_mutex); - cb->args[0] = idx; return msg->len; } @@ -7498,8 +8147,8 @@ static int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb, } struct devlink_stats { - u64 rx_bytes; - u64 rx_packets; + u64_stats_t rx_bytes; + u64_stats_t rx_packets; struct u64_stats_sync syncp; }; @@ -7656,12 +8305,12 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, cpu_stats = per_cpu_ptr(trap_stats, i); do { start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); - rx_packets = cpu_stats->rx_packets; - rx_bytes = cpu_stats->rx_bytes; + rx_packets = u64_stats_read(&cpu_stats->rx_packets); + rx_bytes = u64_stats_read(&cpu_stats->rx_bytes); } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); - stats->rx_packets += rx_packets; - stats->rx_bytes += rx_bytes; + u64_stats_add(&stats->rx_packets, rx_packets); + u64_stats_add(&stats->rx_bytes, rx_bytes); } } @@ -7679,11 +8328,13 @@ devlink_trap_group_stats_put(struct sk_buff *msg, return -EMSGSIZE; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, - stats.rx_packets, DEVLINK_ATTR_PAD)) + u64_stats_read(&stats.rx_packets), + DEVLINK_ATTR_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, - stats.rx_bytes, DEVLINK_ATTR_PAD)) + u64_stats_read(&stats.rx_bytes), + DEVLINK_ATTR_PAD)) goto nla_put_failure; nla_nest_end(msg, attr); @@ -7723,11 +8374,13 @@ static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink, goto nla_put_failure; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, - stats.rx_packets, DEVLINK_ATTR_PAD)) + u64_stats_read(&stats.rx_packets), + DEVLINK_ATTR_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, - stats.rx_bytes, DEVLINK_ATTR_PAD)) + u64_stats_read(&stats.rx_bytes), + DEVLINK_ATTR_PAD)) goto nla_put_failure; nla_nest_end(msg, attr); @@ -7834,15 +8487,8 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, int idx = 0; int err; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry; - - mutex_lock(&devlink->lock); + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + devl_lock(devlink); list_for_each_entry(trap_item, &devlink->trap_list, list) { if (idx < start) { idx++; @@ -7854,19 +8500,16 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) { - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); goto out; } idx++; } - mutex_unlock(&devlink->lock); -retry: + devl_unlock(devlink); devlink_put(devlink); } out: - mutex_unlock(&devlink_mutex); - cb->args[0] = idx; return msg->len; } @@ -8061,15 +8704,8 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, int idx = 0; int err; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry; - - mutex_lock(&devlink->lock); + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + devl_lock(devlink); list_for_each_entry(group_item, &devlink->trap_group_list, list) { if (idx < start) { @@ -8082,19 +8718,16 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) { - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); goto out; } idx++; } - mutex_unlock(&devlink->lock); -retry: + devl_unlock(devlink); devlink_put(devlink); } out: - mutex_unlock(&devlink_mutex); - cb->args[0] = idx; return msg->len; } @@ -8375,15 +9008,8 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, int idx = 0; int err; - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry; - - mutex_lock(&devlink->lock); + devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { + devl_lock(devlink); list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { if (idx < start) { @@ -8396,19 +9022,16 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) { - mutex_unlock(&devlink->lock); + devl_unlock(devlink); devlink_put(devlink); goto out; } idx++; } - mutex_unlock(&devlink->lock); -retry: + devl_unlock(devlink); devlink_put(devlink); } out: - mutex_unlock(&devlink_mutex); - cb->args[0] = idx; return msg->len; } @@ -8546,6 +9169,9 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -8600,26 +9226,37 @@ static const struct genl_small_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_split_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { .cmd = DEVLINK_CMD_PORT_UNSPLIT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_port_unsplit_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, }, { .cmd = DEVLINK_CMD_PORT_NEW, .doit = devlink_nl_cmd_port_new_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_PORT_DEL, .doit = devlink_nl_cmd_port_del_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, + }, + { + .cmd = DEVLINK_CMD_LINECARD_GET, + .doit = devlink_nl_cmd_linecard_get_doit, + .dumpit = devlink_nl_cmd_linecard_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_LINECARD_SET, + .doit = devlink_nl_cmd_linecard_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, }, { .cmd = DEVLINK_CMD_SB_GET, @@ -8688,14 +9325,12 @@ static const struct genl_small_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_eswitch_get_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_ESWITCH_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_eswitch_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, @@ -8738,7 +9373,6 @@ static const struct genl_small_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_reload, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_PARAM_GET, @@ -8806,8 +9440,7 @@ static const struct genl_small_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_get_doit, .dumpit = devlink_nl_cmd_health_reporter_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, /* can be retrieved by unprivileged users */ }, { @@ -8815,24 +9448,21 @@ static const struct genl_small_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_set_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_recover_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_diagnose_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, @@ -8840,24 +9470,20 @@ static const struct genl_small_ops devlink_nl_ops[] = { GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | - DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_dump_clear_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_health_reporter_test_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | - DEVLINK_NL_FLAG_NO_LOCK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT, }, { .cmd = DEVLINK_CMD_FLASH_UPDATE, @@ -8898,6 +9524,17 @@ static const struct genl_small_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_trap_policer_set_doit, .flags = GENL_ADMIN_PERM, }, + { + .cmd = DEVLINK_CMD_SELFTESTS_GET, + .doit = devlink_nl_cmd_selftests_get_doit, + .dumpit = devlink_nl_cmd_selftests_get_dumpit + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_SELFTESTS_RUN, + .doit = devlink_nl_cmd_selftests_run, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -8906,11 +9543,13 @@ static struct genl_family devlink_nl_family __ro_after_init = { .maxattr = DEVLINK_ATTR_MAX, .policy = devlink_nl_policy, .netnsok = true, + .parallel_ops = true, .pre_doit = devlink_nl_pre_doit, .post_doit = devlink_nl_post_doit, .module = THIS_MODULE, .small_ops = devlink_nl_ops, .n_small_ops = ARRAY_SIZE(devlink_nl_ops), + .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, .mcgrps = devlink_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), }; @@ -9004,6 +9643,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, write_pnet(&devlink->_net, net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->rate_list); + INIT_LIST_HEAD(&devlink->linecard_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); @@ -9013,8 +9653,11 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, INIT_LIST_HEAD(&devlink->trap_list); INIT_LIST_HEAD(&devlink->trap_group_list); INIT_LIST_HEAD(&devlink->trap_policer_list); + lockdep_register_key(&devlink->lock_key); mutex_init(&devlink->lock); + lockdep_set_class(&devlink->lock, &devlink->lock_key); mutex_init(&devlink->reporters_lock); + mutex_init(&devlink->linecards_lock); refcount_set(&devlink->refcount, 1); init_completion(&devlink->comp); @@ -9041,10 +9684,14 @@ static void devlink_notify_register(struct devlink *devlink) struct devlink_param_item *param_item; struct devlink_trap_item *trap_item; struct devlink_port *devlink_port; + struct devlink_linecard *linecard; struct devlink_rate *rate_node; struct devlink_region *region; devlink_notify(devlink, DEVLINK_CMD_NEW); + list_for_each_entry(linecard, &devlink->linecard_list, list) + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + list_for_each_entry(devlink_port, &devlink->port_list, list) devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); @@ -9116,10 +9763,8 @@ void devlink_register(struct devlink *devlink) ASSERT_DEVLINK_NOT_REGISTERED(devlink); /* Make sure that we are in .probe() routine */ - mutex_lock(&devlink_mutex); xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_notify_register(devlink); - mutex_unlock(&devlink_mutex); } EXPORT_SYMBOL_GPL(devlink_register); @@ -9133,13 +9778,13 @@ void devlink_unregister(struct devlink *devlink) ASSERT_DEVLINK_REGISTERED(devlink); /* Make sure that we are in .remove() routine */ + xa_set_mark(&devlinks, devlink->index, DEVLINK_UNREGISTERING); devlink_put(devlink); wait_for_completion(&devlink->comp); - mutex_lock(&devlink_mutex); devlink_notify_unregister(devlink); xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); - mutex_unlock(&devlink_mutex); + xa_clear_mark(&devlinks, devlink->index, DEVLINK_UNREGISTERING); } EXPORT_SYMBOL_GPL(devlink_unregister); @@ -9152,8 +9797,10 @@ void devlink_free(struct devlink *devlink) { ASSERT_DEVLINK_NOT_REGISTERED(devlink); + mutex_destroy(&devlink->linecards_lock); mutex_destroy(&devlink->reporters_lock); mutex_destroy(&devlink->lock); + lockdep_unregister_key(&devlink->lock_key); WARN_ON(!list_empty(&devlink->trap_policer_list)); WARN_ON(!list_empty(&devlink->trap_group_list)); WARN_ON(!list_empty(&devlink->trap_list)); @@ -9164,6 +9811,7 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); WARN_ON(!list_empty(&devlink->rate_list)); + WARN_ON(!list_empty(&devlink->linecard_list)); WARN_ON(!list_empty(&devlink->port_list)); xa_destroy(&devlink->snapshot_ids); @@ -9207,6 +9855,83 @@ static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) } /** + * devlink_port_init() - Init devlink port + * + * @devlink: devlink + * @devlink_port: devlink port + * + * Initialize essencial stuff that is needed for functions + * that may be called before devlink port registration. + * Call to this function is optional and not needed + * in case the driver does not use such functions. + */ +void devlink_port_init(struct devlink *devlink, + struct devlink_port *devlink_port) +{ + if (devlink_port->initialized) + return; + devlink_port->devlink = devlink; + INIT_LIST_HEAD(&devlink_port->region_list); + devlink_port->initialized = true; +} +EXPORT_SYMBOL_GPL(devlink_port_init); + +/** + * devlink_port_fini() - Deinitialize devlink port + * + * @devlink_port: devlink port + * + * Deinitialize essencial stuff that is in use for functions + * that may be called after devlink port unregistration. + * Call to this function is optional and not needed + * in case the driver does not use such functions. + */ +void devlink_port_fini(struct devlink_port *devlink_port) +{ + WARN_ON(!list_empty(&devlink_port->region_list)); +} +EXPORT_SYMBOL_GPL(devlink_port_fini); + +/** + * devl_port_register() - Register devlink port + * + * @devlink: devlink + * @devlink_port: devlink port + * @port_index: driver-specific numerical identifier of the port + * + * Register devlink port with provided port index. User can use + * any indexing, even hw-related one. devlink_port structure + * is convenient to be embedded inside user driver private structure. + * Note that the caller should take care of zeroing the devlink_port + * structure. + */ +int devl_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index) +{ + devl_assert_locked(devlink); + + if (devlink_port_index_exists(devlink, port_index)) + return -EEXIST; + + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + + devlink_port_init(devlink, devlink_port); + devlink_port->registered = true; + devlink_port->index = port_index; + spin_lock_init(&devlink_port->type_lock); + INIT_LIST_HEAD(&devlink_port->reporter_list); + mutex_init(&devlink_port->reporters_lock); + list_add_tail(&devlink_port->list, &devlink->port_list); + + INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); + devlink_port_type_warn_schedule(devlink_port); + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; +} +EXPORT_SYMBOL_GPL(devl_port_register); + +/** * devlink_port_register - Register devlink port * * @devlink: devlink @@ -9218,51 +9943,54 @@ static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) * is convenient to be embedded inside user driver private structure. * Note that the caller should take care of zeroing the devlink_port * structure. + * + * Context: Takes and release devlink->lock <mutex>. */ int devlink_port_register(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index) { - mutex_lock(&devlink->lock); - if (devlink_port_index_exists(devlink, port_index)) { - mutex_unlock(&devlink->lock); - return -EEXIST; - } + int err; - WARN_ON(devlink_port->devlink); - devlink_port->devlink = devlink; - devlink_port->index = port_index; - spin_lock_init(&devlink_port->type_lock); - INIT_LIST_HEAD(&devlink_port->reporter_list); - mutex_init(&devlink_port->reporters_lock); - list_add_tail(&devlink_port->list, &devlink->port_list); - INIT_LIST_HEAD(&devlink_port->param_list); - INIT_LIST_HEAD(&devlink_port->region_list); - mutex_unlock(&devlink->lock); - INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); - devlink_port_type_warn_schedule(devlink_port); - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); - return 0; + devl_lock(devlink); + err = devl_port_register(devlink, devlink_port, port_index); + devl_unlock(devlink); + return err; } EXPORT_SYMBOL_GPL(devlink_port_register); /** - * devlink_port_unregister - Unregister devlink port + * devl_port_unregister() - Unregister devlink port * - * @devlink_port: devlink port + * @devlink_port: devlink port */ -void devlink_port_unregister(struct devlink_port *devlink_port) +void devl_port_unregister(struct devlink_port *devlink_port) { - struct devlink *devlink = devlink_port->devlink; + lockdep_assert_held(&devlink_port->devlink->lock); devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); - mutex_lock(&devlink->lock); list_del(&devlink_port->list); - mutex_unlock(&devlink->lock); WARN_ON(!list_empty(&devlink_port->reporter_list)); - WARN_ON(!list_empty(&devlink_port->region_list)); mutex_destroy(&devlink_port->reporters_lock); + devlink_port->registered = false; +} +EXPORT_SYMBOL_GPL(devl_port_unregister); + +/** + * devlink_port_unregister - Unregister devlink port + * + * @devlink_port: devlink port + * + * Context: Takes and release devlink->lock <mutex>. + */ +void devlink_port_unregister(struct devlink_port *devlink_port) +{ + struct devlink *devlink = devlink_port->devlink; + + devl_lock(devlink); + devl_port_unregister(devlink_port); + devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_port_unregister); @@ -9270,8 +9998,8 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, void *type_dev) { - if (WARN_ON(!devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); + devlink_port_type_warn_cancel(devlink_port); spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; @@ -9390,8 +10118,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, { int ret; - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) @@ -9414,8 +10142,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) @@ -9441,8 +10169,8 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) @@ -9469,8 +10197,8 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); if (ret) @@ -9483,30 +10211,26 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); /** - * devlink_rate_leaf_create - create devlink rate leaf - * + * devl_rate_leaf_create - create devlink rate leaf * @devlink_port: devlink port object to create rate object on * @priv: driver private data * * Create devlink rate object of type leaf on provided @devlink_port. - * Throws call trace if @devlink_port already has a devlink rate object. - * - * Context: Takes and release devlink->lock <mutex>. - * - * Return: -ENOMEM if failed to allocate rate object, 0 otherwise. */ -int -devlink_rate_leaf_create(struct devlink_port *devlink_port, void *priv) +int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv) { struct devlink *devlink = devlink_port->devlink; struct devlink_rate *devlink_rate; + devl_assert_locked(devlink_port->devlink); + + if (WARN_ON(devlink_port->devlink_rate)) + return -EBUSY; + devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); if (!devlink_rate) return -ENOMEM; - mutex_lock(&devlink->lock); - WARN_ON(devlink_port->devlink_rate); devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; devlink_rate->devlink = devlink; devlink_rate->devlink_port = devlink_port; @@ -9514,54 +10238,49 @@ devlink_rate_leaf_create(struct devlink_port *devlink_port, void *priv) list_add_tail(&devlink_rate->list, &devlink->rate_list); devlink_port->devlink_rate = devlink_rate; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); - mutex_unlock(&devlink->lock); return 0; } -EXPORT_SYMBOL_GPL(devlink_rate_leaf_create); +EXPORT_SYMBOL_GPL(devl_rate_leaf_create); /** - * devlink_rate_leaf_destroy - destroy devlink rate leaf + * devl_rate_leaf_destroy - destroy devlink rate leaf * * @devlink_port: devlink port linked to the rate object * - * Context: Takes and release devlink->lock <mutex>. + * Destroy the devlink rate object of type leaf on provided @devlink_port. */ -void devlink_rate_leaf_destroy(struct devlink_port *devlink_port) +void devl_rate_leaf_destroy(struct devlink_port *devlink_port) { struct devlink_rate *devlink_rate = devlink_port->devlink_rate; - struct devlink *devlink = devlink_port->devlink; + devl_assert_locked(devlink_port->devlink); if (!devlink_rate) return; - mutex_lock(&devlink->lock); devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); if (devlink_rate->parent) refcount_dec(&devlink_rate->parent->refcnt); list_del(&devlink_rate->list); devlink_port->devlink_rate = NULL; - mutex_unlock(&devlink->lock); kfree(devlink_rate); } -EXPORT_SYMBOL_GPL(devlink_rate_leaf_destroy); +EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); /** - * devlink_rate_nodes_destroy - destroy all devlink rate nodes on device - * + * devl_rate_nodes_destroy - destroy all devlink rate nodes on device * @devlink: devlink instance * * Unset parent for all rate objects and destroy all rate nodes * on specified device. - * - * Context: Takes and release devlink->lock <mutex>. */ -void devlink_rate_nodes_destroy(struct devlink *devlink) +void devl_rate_nodes_destroy(struct devlink *devlink) { static struct devlink_rate *devlink_rate, *tmp; const struct devlink_ops *ops = devlink->ops; - mutex_lock(&devlink->lock); + devl_assert_locked(devlink); + list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (!devlink_rate->parent) continue; @@ -9582,9 +10301,23 @@ void devlink_rate_nodes_destroy(struct devlink *devlink) kfree(devlink_rate); } } - mutex_unlock(&devlink->lock); } -EXPORT_SYMBOL_GPL(devlink_rate_nodes_destroy); +EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy); + +/** + * devlink_port_linecard_set - Link port with a linecard + * + * @devlink_port: devlink port + * @linecard: devlink linecard + */ +void devlink_port_linecard_set(struct devlink_port *devlink_port, + struct devlink_linecard *linecard) +{ + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + + devlink_port->linecard = linecard; +} +EXPORT_SYMBOL_GPL(devlink_port_linecard_set); static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) @@ -9597,7 +10330,12 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: - n = snprintf(name, len, "p%u", attrs->phys.port_number); + if (devlink_port->linecard) + n = snprintf(name, len, "l%u", + devlink_port->linecard->index); + if (n < len) + n += snprintf(name + n, len - n, "p%u", + attrs->phys.port_number); if (n < len && attrs->split) n += snprintf(name + n, len - n, "s%u", attrs->phys.split_subport_number); @@ -9652,25 +10390,241 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, return 0; } -int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, - u32 size, u16 ingress_pools_count, - u16 egress_pools_count, u16 ingress_tc_count, - u16 egress_tc_count) +static int devlink_linecard_types_init(struct devlink_linecard *linecard) { - struct devlink_sb *devlink_sb; - int err = 0; + struct devlink_linecard_type *linecard_type; + unsigned int count; + int i; - mutex_lock(&devlink->lock); - if (devlink_sb_index_exists(devlink, sb_index)) { - err = -EEXIST; - goto unlock; + count = linecard->ops->types_count(linecard, linecard->priv); + linecard->types = kmalloc_array(count, sizeof(*linecard_type), + GFP_KERNEL); + if (!linecard->types) + return -ENOMEM; + linecard->types_count = count; + + for (i = 0; i < count; i++) { + linecard_type = &linecard->types[i]; + linecard->ops->types_get(linecard, linecard->priv, i, + &linecard_type->type, + &linecard_type->priv); } + return 0; +} - devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL); - if (!devlink_sb) { - err = -ENOMEM; - goto unlock; +static void devlink_linecard_types_fini(struct devlink_linecard *linecard) +{ + kfree(linecard->types); +} + +/** + * devlink_linecard_create - Create devlink linecard + * + * @devlink: devlink + * @linecard_index: driver-specific numerical identifier of the linecard + * @ops: linecards ops + * @priv: user priv pointer + * + * Create devlink linecard instance with provided linecard index. + * Caller can use any indexing, even hw-related one. + * + * Return: Line card structure or an ERR_PTR() encoded error code. + */ +struct devlink_linecard * +devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, + const struct devlink_linecard_ops *ops, void *priv) +{ + struct devlink_linecard *linecard; + int err; + + if (WARN_ON(!ops || !ops->provision || !ops->unprovision || + !ops->types_count || !ops->types_get)) + return ERR_PTR(-EINVAL); + + mutex_lock(&devlink->linecards_lock); + if (devlink_linecard_index_exists(devlink, linecard_index)) { + mutex_unlock(&devlink->linecards_lock); + return ERR_PTR(-EEXIST); } + + linecard = kzalloc(sizeof(*linecard), GFP_KERNEL); + if (!linecard) { + mutex_unlock(&devlink->linecards_lock); + return ERR_PTR(-ENOMEM); + } + + linecard->devlink = devlink; + linecard->index = linecard_index; + linecard->ops = ops; + linecard->priv = priv; + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + mutex_init(&linecard->state_lock); + + err = devlink_linecard_types_init(linecard); + if (err) { + mutex_destroy(&linecard->state_lock); + kfree(linecard); + mutex_unlock(&devlink->linecards_lock); + return ERR_PTR(err); + } + + list_add_tail(&linecard->list, &devlink->linecard_list); + refcount_set(&linecard->refcount, 1); + mutex_unlock(&devlink->linecards_lock); + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + return linecard; +} +EXPORT_SYMBOL_GPL(devlink_linecard_create); + +/** + * devlink_linecard_destroy - Destroy devlink linecard + * + * @linecard: devlink linecard + */ +void devlink_linecard_destroy(struct devlink_linecard *linecard) +{ + struct devlink *devlink = linecard->devlink; + + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); + mutex_lock(&devlink->linecards_lock); + list_del(&linecard->list); + devlink_linecard_types_fini(linecard); + mutex_unlock(&devlink->linecards_lock); + devlink_linecard_put(linecard); +} +EXPORT_SYMBOL_GPL(devlink_linecard_destroy); + +/** + * devlink_linecard_provision_set - Set provisioning on linecard + * + * @linecard: devlink linecard + * @type: linecard type + * + * This is either called directly from the provision() op call or + * as a result of the provision() op call asynchronously. + */ +void devlink_linecard_provision_set(struct devlink_linecard *linecard, + const char *type) +{ + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->type && strcmp(linecard->type, type)); + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; + linecard->type = type; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); + +/** + * devlink_linecard_provision_clear - Clear provisioning on linecard + * + * @linecard: devlink linecard + * + * This is either called directly from the unprovision() op call or + * as a result of the unprovision() op call asynchronously. + */ +void devlink_linecard_provision_clear(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->nested_devlink); + linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; + linecard->type = NULL; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear); + +/** + * devlink_linecard_provision_fail - Fail provisioning on linecard + * + * @linecard: devlink linecard + * + * This is either called directly from the provision() op call or + * as a result of the provision() op call asynchronously. + */ +void devlink_linecard_provision_fail(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->nested_devlink); + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail); + +/** + * devlink_linecard_activate - Set linecard active + * + * @linecard: devlink linecard + */ +void devlink_linecard_activate(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED); + linecard->state = DEVLINK_LINECARD_STATE_ACTIVE; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_activate); + +/** + * devlink_linecard_deactivate - Set linecard inactive + * + * @linecard: devlink linecard + */ +void devlink_linecard_deactivate(struct devlink_linecard *linecard) +{ + mutex_lock(&linecard->state_lock); + switch (linecard->state) { + case DEVLINK_LINECARD_STATE_ACTIVE: + linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + break; + case DEVLINK_LINECARD_STATE_UNPROVISIONING: + /* Line card is being deactivated as part + * of unprovisioning flow. + */ + break; + default: + WARN_ON(1); + break; + } + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_deactivate); + +/** + * devlink_linecard_nested_dl_set - Attach/detach nested devlink + * instance to linecard. + * + * @linecard: devlink linecard + * @nested_devlink: devlink instance to attach or NULL to detach + */ +void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard, + struct devlink *nested_devlink) +{ + mutex_lock(&linecard->state_lock); + linecard->nested_devlink = nested_devlink; + devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); + mutex_unlock(&linecard->state_lock); +} +EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set); + +int devl_sb_register(struct devlink *devlink, unsigned int sb_index, + u32 size, u16 ingress_pools_count, + u16 egress_pools_count, u16 ingress_tc_count, + u16 egress_tc_count) +{ + struct devlink_sb *devlink_sb; + + lockdep_assert_held(&devlink->lock); + + if (devlink_sb_index_exists(devlink, sb_index)) + return -EEXIST; + + devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL); + if (!devlink_sb) + return -ENOMEM; devlink_sb->index = sb_index; devlink_sb->size = size; devlink_sb->ingress_pools_count = ingress_pools_count; @@ -9678,57 +10632,78 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, devlink_sb->ingress_tc_count = ingress_tc_count; devlink_sb->egress_tc_count = egress_tc_count; list_add_tail(&devlink_sb->list, &devlink->sb_list); -unlock: - mutex_unlock(&devlink->lock); + return 0; +} +EXPORT_SYMBOL_GPL(devl_sb_register); + +int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, + u32 size, u16 ingress_pools_count, + u16 egress_pools_count, u16 ingress_tc_count, + u16 egress_tc_count) +{ + int err; + + devl_lock(devlink); + err = devl_sb_register(devlink, sb_index, size, ingress_pools_count, + egress_pools_count, ingress_tc_count, + egress_tc_count); + devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_sb_register); -void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) +void devl_sb_unregister(struct devlink *devlink, unsigned int sb_index) { struct devlink_sb *devlink_sb; - mutex_lock(&devlink->lock); + lockdep_assert_held(&devlink->lock); + devlink_sb = devlink_sb_get_by_index(devlink, sb_index); WARN_ON(!devlink_sb); list_del(&devlink_sb->list); - mutex_unlock(&devlink->lock); kfree(devlink_sb); } +EXPORT_SYMBOL_GPL(devl_sb_unregister); + +void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) +{ + devl_lock(devlink); + devl_sb_unregister(devlink, sb_index); + devl_unlock(devlink); +} EXPORT_SYMBOL_GPL(devlink_sb_unregister); /** - * devlink_dpipe_headers_register - register dpipe headers + * devl_dpipe_headers_register - register dpipe headers * - * @devlink: devlink - * @dpipe_headers: dpipe header array + * @devlink: devlink + * @dpipe_headers: dpipe header array * - * Register the headers supported by hardware. + * Register the headers supported by hardware. */ -int devlink_dpipe_headers_register(struct devlink *devlink, - struct devlink_dpipe_headers *dpipe_headers) +void devl_dpipe_headers_register(struct devlink *devlink, + struct devlink_dpipe_headers *dpipe_headers) { - mutex_lock(&devlink->lock); + lockdep_assert_held(&devlink->lock); + devlink->dpipe_headers = dpipe_headers; - mutex_unlock(&devlink->lock); - return 0; } -EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register); +EXPORT_SYMBOL_GPL(devl_dpipe_headers_register); /** - * devlink_dpipe_headers_unregister - unregister dpipe headers + * devl_dpipe_headers_unregister - unregister dpipe headers * - * @devlink: devlink + * @devlink: devlink * - * Unregister the headers supported by hardware. + * Unregister the headers supported by hardware. */ -void devlink_dpipe_headers_unregister(struct devlink *devlink) +void devl_dpipe_headers_unregister(struct devlink *devlink) { - mutex_lock(&devlink->lock); + lockdep_assert_held(&devlink->lock); + devlink->dpipe_headers = NULL; - mutex_unlock(&devlink->lock); } -EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister); +EXPORT_SYMBOL_GPL(devl_dpipe_headers_unregister); /** * devlink_dpipe_table_counter_enabled - check if counter allocation @@ -9762,38 +10737,33 @@ bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled); /** - * devlink_dpipe_table_register - register dpipe table + * devl_dpipe_table_register - register dpipe table * - * @devlink: devlink - * @table_name: table name - * @table_ops: table ops - * @priv: priv - * @counter_control_extern: external control for counters + * @devlink: devlink + * @table_name: table name + * @table_ops: table ops + * @priv: priv + * @counter_control_extern: external control for counters */ -int devlink_dpipe_table_register(struct devlink *devlink, - const char *table_name, - struct devlink_dpipe_table_ops *table_ops, - void *priv, bool counter_control_extern) +int devl_dpipe_table_register(struct devlink *devlink, + const char *table_name, + struct devlink_dpipe_table_ops *table_ops, + void *priv, bool counter_control_extern) { struct devlink_dpipe_table *table; - int err = 0; + + lockdep_assert_held(&devlink->lock); if (WARN_ON(!table_ops->size_get)) return -EINVAL; - mutex_lock(&devlink->lock); - if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name, - devlink)) { - err = -EEXIST; - goto unlock; - } + devlink)) + return -EEXIST; table = kzalloc(sizeof(*table), GFP_KERNEL); - if (!table) { - err = -ENOMEM; - goto unlock; - } + if (!table) + return -ENOMEM; table->name = table_name; table->table_ops = table_ops; @@ -9801,77 +10771,69 @@ int devlink_dpipe_table_register(struct devlink *devlink, table->counter_control_extern = counter_control_extern; list_add_tail_rcu(&table->list, &devlink->dpipe_table_list); -unlock: - mutex_unlock(&devlink->lock); - return err; + + return 0; } -EXPORT_SYMBOL_GPL(devlink_dpipe_table_register); +EXPORT_SYMBOL_GPL(devl_dpipe_table_register); /** - * devlink_dpipe_table_unregister - unregister dpipe table + * devl_dpipe_table_unregister - unregister dpipe table * - * @devlink: devlink - * @table_name: table name + * @devlink: devlink + * @table_name: table name */ -void devlink_dpipe_table_unregister(struct devlink *devlink, - const char *table_name) +void devl_dpipe_table_unregister(struct devlink *devlink, + const char *table_name) { struct devlink_dpipe_table *table; - mutex_lock(&devlink->lock); + lockdep_assert_held(&devlink->lock); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name, devlink); if (!table) - goto unlock; + return; list_del_rcu(&table->list); - mutex_unlock(&devlink->lock); kfree_rcu(table, rcu); - return; -unlock: - mutex_unlock(&devlink->lock); } -EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); +EXPORT_SYMBOL_GPL(devl_dpipe_table_unregister); /** - * devlink_resource_register - devlink resource register + * devl_resource_register - devlink resource register * - * @devlink: devlink - * @resource_name: resource's name - * @resource_size: resource's size - * @resource_id: resource's id - * @parent_resource_id: resource's parent id - * @size_params: size parameters + * @devlink: devlink + * @resource_name: resource's name + * @resource_size: resource's size + * @resource_id: resource's id + * @parent_resource_id: resource's parent id + * @size_params: size parameters * - * Generic resources should reuse the same names across drivers. - * Please see the generic resources list at: - * Documentation/networking/devlink/devlink-resource.rst + * Generic resources should reuse the same names across drivers. + * Please see the generic resources list at: + * Documentation/networking/devlink/devlink-resource.rst */ -int devlink_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params) +int devl_resource_register(struct devlink *devlink, + const char *resource_name, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + const struct devlink_resource_size_params *size_params) { struct devlink_resource *resource; struct list_head *resource_list; bool top_hierarchy; - int err = 0; + + lockdep_assert_held(&devlink->lock); top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP; - mutex_lock(&devlink->lock); resource = devlink_resource_find(devlink, NULL, resource_id); - if (resource) { - err = -EINVAL; - goto out; - } + if (resource) + return -EINVAL; resource = kzalloc(sizeof(*resource), GFP_KERNEL); - if (!resource) { - err = -ENOMEM; - goto out; - } + if (!resource) + return -ENOMEM; if (top_hierarchy) { resource_list = &devlink->resource_list; @@ -9885,8 +10847,7 @@ int devlink_resource_register(struct devlink *devlink, resource->parent = parent_resource; } else { kfree(resource); - err = -EINVAL; - goto out; + return -EINVAL; } } @@ -9899,101 +10860,168 @@ int devlink_resource_register(struct devlink *devlink, sizeof(resource->size_params)); INIT_LIST_HEAD(&resource->resource_list); list_add_tail(&resource->list, resource_list); -out: - mutex_unlock(&devlink->lock); - return err; + + return 0; } -EXPORT_SYMBOL_GPL(devlink_resource_register); +EXPORT_SYMBOL_GPL(devl_resource_register); /** - * devlink_resources_unregister - free all resources + * devlink_resource_register - devlink resource register * * @devlink: devlink - * @resource: resource + * @resource_name: resource's name + * @resource_size: resource's size + * @resource_id: resource's id + * @parent_resource_id: resource's parent id + * @size_params: size parameters + * + * Generic resources should reuse the same names across drivers. + * Please see the generic resources list at: + * Documentation/networking/devlink/devlink-resource.rst + * + * Context: Takes and release devlink->lock <mutex>. */ -void devlink_resources_unregister(struct devlink *devlink, - struct devlink_resource *resource) +int devlink_resource_register(struct devlink *devlink, + const char *resource_name, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + const struct devlink_resource_size_params *size_params) +{ + int err; + + devl_lock(devlink); + err = devl_resource_register(devlink, resource_name, resource_size, + resource_id, parent_resource_id, size_params); + devl_unlock(devlink); + return err; +} +EXPORT_SYMBOL_GPL(devlink_resource_register); + +static void devlink_resource_unregister(struct devlink *devlink, + struct devlink_resource *resource) { struct devlink_resource *tmp, *child_resource; - struct list_head *resource_list; - if (resource) - resource_list = &resource->resource_list; - else - resource_list = &devlink->resource_list; + list_for_each_entry_safe(child_resource, tmp, &resource->resource_list, + list) { + devlink_resource_unregister(devlink, child_resource); + list_del(&child_resource->list); + kfree(child_resource); + } +} - if (!resource) - mutex_lock(&devlink->lock); +/** + * devl_resources_unregister - free all resources + * + * @devlink: devlink + */ +void devl_resources_unregister(struct devlink *devlink) +{ + struct devlink_resource *tmp, *child_resource; + + lockdep_assert_held(&devlink->lock); - list_for_each_entry_safe(child_resource, tmp, resource_list, list) { - devlink_resources_unregister(devlink, child_resource); + list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list, + list) { + devlink_resource_unregister(devlink, child_resource); list_del(&child_resource->list); kfree(child_resource); } +} +EXPORT_SYMBOL_GPL(devl_resources_unregister); - if (!resource) - mutex_unlock(&devlink->lock); +/** + * devlink_resources_unregister - free all resources + * + * @devlink: devlink + * + * Context: Takes and release devlink->lock <mutex>. + */ +void devlink_resources_unregister(struct devlink *devlink) +{ + devl_lock(devlink); + devl_resources_unregister(devlink); + devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_resources_unregister); /** - * devlink_resource_size_get - get and update size + * devl_resource_size_get - get and update size * - * @devlink: devlink - * @resource_id: the requested resource id - * @p_resource_size: ptr to update + * @devlink: devlink + * @resource_id: the requested resource id + * @p_resource_size: ptr to update */ -int devlink_resource_size_get(struct devlink *devlink, - u64 resource_id, - u64 *p_resource_size) +int devl_resource_size_get(struct devlink *devlink, + u64 resource_id, + u64 *p_resource_size) { struct devlink_resource *resource; - int err = 0; - mutex_lock(&devlink->lock); + lockdep_assert_held(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); - if (!resource) { - err = -EINVAL; - goto out; - } + if (!resource) + return -EINVAL; *p_resource_size = resource->size_new; resource->size = resource->size_new; -out: - mutex_unlock(&devlink->lock); - return err; + return 0; } -EXPORT_SYMBOL_GPL(devlink_resource_size_get); +EXPORT_SYMBOL_GPL(devl_resource_size_get); /** - * devlink_dpipe_table_resource_set - set the resource id + * devl_dpipe_table_resource_set - set the resource id * - * @devlink: devlink - * @table_name: table name - * @resource_id: resource id - * @resource_units: number of resource's units consumed per table's entry + * @devlink: devlink + * @table_name: table name + * @resource_id: resource id + * @resource_units: number of resource's units consumed per table's entry */ -int devlink_dpipe_table_resource_set(struct devlink *devlink, - const char *table_name, u64 resource_id, - u64 resource_units) +int devl_dpipe_table_resource_set(struct devlink *devlink, + const char *table_name, u64 resource_id, + u64 resource_units) { struct devlink_dpipe_table *table; - int err = 0; - mutex_lock(&devlink->lock); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name, devlink); - if (!table) { - err = -EINVAL; - goto out; - } + if (!table) + return -EINVAL; + table->resource_id = resource_id; table->resource_units = resource_units; table->resource_valid = true; -out: - mutex_unlock(&devlink->lock); - return err; + return 0; +} +EXPORT_SYMBOL_GPL(devl_dpipe_table_resource_set); + +/** + * devl_resource_occ_get_register - register occupancy getter + * + * @devlink: devlink + * @resource_id: resource id + * @occ_get: occupancy getter callback + * @occ_get_priv: occupancy getter callback priv + */ +void devl_resource_occ_get_register(struct devlink *devlink, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv) +{ + struct devlink_resource *resource; + + lockdep_assert_held(&devlink->lock); + + resource = devlink_resource_find(devlink, NULL, resource_id); + if (WARN_ON(!resource)) + return; + WARN_ON(resource->occ_get); + + resource->occ_get = occ_get; + resource->occ_get_priv = occ_get_priv; } -EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set); +EXPORT_SYMBOL_GPL(devl_resource_occ_get_register); /** * devlink_resource_occ_get_register - register occupancy getter @@ -10002,48 +11030,58 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set); * @resource_id: resource id * @occ_get: occupancy getter callback * @occ_get_priv: occupancy getter callback priv + * + * Context: Takes and release devlink->lock <mutex>. */ void devlink_resource_occ_get_register(struct devlink *devlink, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv) { + devl_lock(devlink); + devl_resource_occ_get_register(devlink, resource_id, + occ_get, occ_get_priv); + devl_unlock(devlink); +} +EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register); + +/** + * devl_resource_occ_get_unregister - unregister occupancy getter + * + * @devlink: devlink + * @resource_id: resource id + */ +void devl_resource_occ_get_unregister(struct devlink *devlink, + u64 resource_id) +{ struct devlink_resource *resource; - mutex_lock(&devlink->lock); + lockdep_assert_held(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); if (WARN_ON(!resource)) - goto out; - WARN_ON(resource->occ_get); + return; + WARN_ON(!resource->occ_get); - resource->occ_get = occ_get; - resource->occ_get_priv = occ_get_priv; -out: - mutex_unlock(&devlink->lock); + resource->occ_get = NULL; + resource->occ_get_priv = NULL; } -EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register); +EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister); /** * devlink_resource_occ_get_unregister - unregister occupancy getter * * @devlink: devlink * @resource_id: resource id + * + * Context: Takes and release devlink->lock <mutex>. */ void devlink_resource_occ_get_unregister(struct devlink *devlink, u64 resource_id) { - struct devlink_resource *resource; - - mutex_lock(&devlink->lock); - resource = devlink_resource_find(devlink, NULL, resource_id); - if (WARN_ON(!resource)) - goto out; - WARN_ON(!resource->occ_get); - - resource->occ_get = NULL; - resource->occ_get_priv = NULL; -out: - mutex_unlock(&devlink->lock); + devl_lock(devlink); + devl_resource_occ_get_unregister(devlink, resource_id); + devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); @@ -10264,51 +11302,67 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id) EXPORT_SYMBOL_GPL(devlink_param_value_changed); /** - * devlink_region_create - create a new address region + * devl_region_create - create a new address region * - * @devlink: devlink - * @ops: region operations and name - * @region_max_snapshots: Maximum supported number of snapshots for region - * @region_size: size of region + * @devlink: devlink + * @ops: region operations and name + * @region_max_snapshots: Maximum supported number of snapshots for region + * @region_size: size of region */ -struct devlink_region * -devlink_region_create(struct devlink *devlink, - const struct devlink_region_ops *ops, - u32 region_max_snapshots, u64 region_size) +struct devlink_region *devl_region_create(struct devlink *devlink, + const struct devlink_region_ops *ops, + u32 region_max_snapshots, + u64 region_size) { struct devlink_region *region; - int err = 0; + + devl_assert_locked(devlink); if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) return ERR_PTR(-EINVAL); - mutex_lock(&devlink->lock); - - if (devlink_region_get_by_name(devlink, ops->name)) { - err = -EEXIST; - goto unlock; - } + if (devlink_region_get_by_name(devlink, ops->name)) + return ERR_PTR(-EEXIST); region = kzalloc(sizeof(*region), GFP_KERNEL); - if (!region) { - err = -ENOMEM; - goto unlock; - } + if (!region) + return ERR_PTR(-ENOMEM); region->devlink = devlink; region->max_snapshots = region_max_snapshots; region->ops = ops; region->size = region_size; INIT_LIST_HEAD(®ion->snapshot_list); + mutex_init(®ion->snapshot_lock); list_add_tail(®ion->list, &devlink->region_list); devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); - mutex_unlock(&devlink->lock); return region; +} +EXPORT_SYMBOL_GPL(devl_region_create); -unlock: - mutex_unlock(&devlink->lock); - return ERR_PTR(err); +/** + * devlink_region_create - create a new address region + * + * @devlink: devlink + * @ops: region operations and name + * @region_max_snapshots: Maximum supported number of snapshots for region + * @region_size: size of region + * + * Context: Takes and release devlink->lock <mutex>. + */ +struct devlink_region * +devlink_region_create(struct devlink *devlink, + const struct devlink_region_ops *ops, + u32 region_max_snapshots, u64 region_size) +{ + struct devlink_region *region; + + devl_lock(devlink); + region = devl_region_create(devlink, ops, region_max_snapshots, + region_size); + devl_unlock(devlink); + return region; } EXPORT_SYMBOL_GPL(devlink_region_create); @@ -10319,6 +11373,8 @@ EXPORT_SYMBOL_GPL(devlink_region_create); * @ops: region operations and name * @region_max_snapshots: Maximum supported number of snapshots for region * @region_size: size of region + * + * Context: Takes and release devlink->lock <mutex>. */ struct devlink_region * devlink_port_region_create(struct devlink_port *port, @@ -10329,10 +11385,12 @@ devlink_port_region_create(struct devlink_port *port, struct devlink_region *region; int err = 0; + ASSERT_DEVLINK_PORT_INITIALIZED(port); + if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) return ERR_PTR(-EINVAL); - mutex_lock(&devlink->lock); + devl_lock(devlink); if (devlink_port_region_get_by_name(port, ops->name)) { err = -EEXIST; @@ -10351,40 +11409,58 @@ devlink_port_region_create(struct devlink_port *port, region->port_ops = ops; region->size = region_size; INIT_LIST_HEAD(®ion->snapshot_list); + mutex_init(®ion->snapshot_lock); list_add_tail(®ion->list, &port->region_list); devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); - mutex_unlock(&devlink->lock); + devl_unlock(devlink); return region; unlock: - mutex_unlock(&devlink->lock); + devl_unlock(devlink); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(devlink_port_region_create); /** - * devlink_region_destroy - destroy address region + * devl_region_destroy - destroy address region * - * @region: devlink region to destroy + * @region: devlink region to destroy */ -void devlink_region_destroy(struct devlink_region *region) +void devl_region_destroy(struct devlink_region *region) { struct devlink *devlink = region->devlink; struct devlink_snapshot *snapshot, *ts; - mutex_lock(&devlink->lock); + devl_assert_locked(devlink); /* Free all snapshots of region */ list_for_each_entry_safe(snapshot, ts, ®ion->snapshot_list, list) devlink_region_snapshot_del(region, snapshot); list_del(®ion->list); + mutex_destroy(®ion->snapshot_lock); devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); - mutex_unlock(&devlink->lock); kfree(region); } +EXPORT_SYMBOL_GPL(devl_region_destroy); + +/** + * devlink_region_destroy - destroy address region + * + * @region: devlink region to destroy + * + * Context: Takes and release devlink->lock <mutex>. + */ +void devlink_region_destroy(struct devlink_region *region) +{ + struct devlink *devlink = region->devlink; + + devl_lock(devlink); + devl_region_destroy(region); + devl_unlock(devlink); +} EXPORT_SYMBOL_GPL(devlink_region_destroy); /** @@ -10404,13 +11480,7 @@ EXPORT_SYMBOL_GPL(devlink_region_destroy); */ int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) { - int err; - - mutex_lock(&devlink->lock); - err = __devlink_region_snapshot_id_get(devlink, id); - mutex_unlock(&devlink->lock); - - return err; + return __devlink_region_snapshot_id_get(devlink, id); } EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get); @@ -10426,9 +11496,7 @@ EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get); */ void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id) { - mutex_lock(&devlink->lock); __devlink_snapshot_id_decrement(devlink, id); - mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put); @@ -10447,13 +11515,11 @@ EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put); int devlink_region_snapshot_create(struct devlink_region *region, u8 *data, u32 snapshot_id) { - struct devlink *devlink = region->devlink; int err; - mutex_lock(&devlink->lock); + mutex_lock(®ion->snapshot_lock); err = __devlink_region_snapshot_create(region, data, snapshot_id); - mutex_unlock(&devlink->lock); - + mutex_unlock(®ion->snapshot_lock); return err; } EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); @@ -10818,7 +11884,7 @@ static void devlink_trap_disable(struct devlink *devlink, } /** - * devlink_traps_register - Register packet traps with devlink. + * devl_traps_register - Register packet traps with devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. @@ -10826,16 +11892,16 @@ static void devlink_trap_disable(struct devlink *devlink, * * Return: Non-zero value on failure. */ -int devlink_traps_register(struct devlink *devlink, - const struct devlink_trap *traps, - size_t traps_count, void *priv) +int devl_traps_register(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count, void *priv) { int i, err; if (!devlink->ops->trap_init || !devlink->ops->trap_action_set) return -EINVAL; - mutex_lock(&devlink->lock); + devl_assert_locked(devlink); for (i = 0; i < traps_count; i++) { const struct devlink_trap *trap = &traps[i]; @@ -10847,7 +11913,6 @@ int devlink_traps_register(struct devlink *devlink, if (err) goto err_trap_register; } - mutex_unlock(&devlink->lock); return 0; @@ -10855,24 +11920,47 @@ err_trap_register: err_trap_verify: for (i--; i >= 0; i--) devlink_trap_unregister(devlink, &traps[i]); - mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devl_traps_register); + +/** + * devlink_traps_register - Register packet traps with devlink. + * @devlink: devlink. + * @traps: Packet traps. + * @traps_count: Count of provided packet traps. + * @priv: Driver private information. + * + * Context: Takes and release devlink->lock <mutex>. + * + * Return: Non-zero value on failure. + */ +int devlink_traps_register(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count, void *priv) +{ + int err; + + devl_lock(devlink); + err = devl_traps_register(devlink, traps, traps_count, priv); + devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_traps_register); /** - * devlink_traps_unregister - Unregister packet traps from devlink. + * devl_traps_unregister - Unregister packet traps from devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. */ -void devlink_traps_unregister(struct devlink *devlink, - const struct devlink_trap *traps, - size_t traps_count) +void devl_traps_unregister(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count) { int i; - mutex_lock(&devlink->lock); + devl_assert_locked(devlink); /* Make sure we do not have any packets in-flight while unregistering * traps by disabling all of them and waiting for a grace period. */ @@ -10881,7 +11969,24 @@ void devlink_traps_unregister(struct devlink *devlink, synchronize_rcu(); for (i = traps_count - 1; i >= 0; i--) devlink_trap_unregister(devlink, &traps[i]); - mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_traps_unregister); + +/** + * devlink_traps_unregister - Unregister packet traps from devlink. + * @devlink: devlink. + * @traps: Packet traps. + * @traps_count: Count of provided packet traps. + * + * Context: Takes and release devlink->lock <mutex>. + */ +void devlink_traps_unregister(struct devlink *devlink, + const struct devlink_trap *traps, + size_t traps_count) +{ + devl_lock(devlink); + devl_traps_unregister(devlink, traps, traps_count); + devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_traps_unregister); @@ -10893,8 +11998,8 @@ devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats, stats = this_cpu_ptr(trap_stats); u64_stats_update_begin(&stats->syncp); - stats->rx_bytes += skb_len; - stats->rx_packets++; + u64_stats_add(&stats->rx_bytes, skb_len); + u64_stats_inc(&stats->rx_packets); u64_stats_update_end(&stats->syncp); } @@ -11040,20 +12145,20 @@ devlink_trap_group_unregister(struct devlink *devlink, } /** - * devlink_trap_groups_register - Register packet trap groups with devlink. + * devl_trap_groups_register - Register packet trap groups with devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. * * Return: Non-zero value on failure. */ -int devlink_trap_groups_register(struct devlink *devlink, - const struct devlink_trap_group *groups, - size_t groups_count) +int devl_trap_groups_register(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count) { int i, err; - mutex_lock(&devlink->lock); + devl_assert_locked(devlink); for (i = 0; i < groups_count; i++) { const struct devlink_trap_group *group = &groups[i]; @@ -11065,7 +12170,6 @@ int devlink_trap_groups_register(struct devlink *devlink, if (err) goto err_trap_group_register; } - mutex_unlock(&devlink->lock); return 0; @@ -11073,27 +12177,66 @@ err_trap_group_register: err_trap_group_verify: for (i--; i >= 0; i--) devlink_trap_group_unregister(devlink, &groups[i]); - mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devl_trap_groups_register); + +/** + * devlink_trap_groups_register - Register packet trap groups with devlink. + * @devlink: devlink. + * @groups: Packet trap groups. + * @groups_count: Count of provided packet trap groups. + * + * Context: Takes and release devlink->lock <mutex>. + * + * Return: Non-zero value on failure. + */ +int devlink_trap_groups_register(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count) +{ + int err; + + devl_lock(devlink); + err = devl_trap_groups_register(devlink, groups, groups_count); + devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_trap_groups_register); /** - * devlink_trap_groups_unregister - Unregister packet trap groups from devlink. + * devl_trap_groups_unregister - Unregister packet trap groups from devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. */ -void devlink_trap_groups_unregister(struct devlink *devlink, - const struct devlink_trap_group *groups, - size_t groups_count) +void devl_trap_groups_unregister(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count) { int i; - mutex_lock(&devlink->lock); + devl_assert_locked(devlink); for (i = groups_count - 1; i >= 0; i--) devlink_trap_group_unregister(devlink, &groups[i]); - mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devl_trap_groups_unregister); + +/** + * devlink_trap_groups_unregister - Unregister packet trap groups from devlink. + * @devlink: devlink. + * @groups: Packet trap groups. + * @groups_count: Count of provided packet trap groups. + * + * Context: Takes and release devlink->lock <mutex>. + */ +void devlink_trap_groups_unregister(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count) +{ + devl_lock(devlink); + devl_trap_groups_unregister(devlink, groups, groups_count); + devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister); @@ -11179,7 +12322,7 @@ devlink_trap_policer_unregister(struct devlink *devlink, } /** - * devlink_trap_policers_register - Register packet trap policers with devlink. + * devl_trap_policers_register - Register packet trap policers with devlink. * @devlink: devlink. * @policers: Packet trap policers. * @policers_count: Count of provided packet trap policers. @@ -11187,13 +12330,13 @@ devlink_trap_policer_unregister(struct devlink *devlink, * Return: Non-zero value on failure. */ int -devlink_trap_policers_register(struct devlink *devlink, - const struct devlink_trap_policer *policers, - size_t policers_count) +devl_trap_policers_register(struct devlink *devlink, + const struct devlink_trap_policer *policers, + size_t policers_count) { int i, err; - mutex_lock(&devlink->lock); + devl_assert_locked(devlink); for (i = 0; i < policers_count; i++) { const struct devlink_trap_policer *policer = &policers[i]; @@ -11208,44 +12351,40 @@ devlink_trap_policers_register(struct devlink *devlink, if (err) goto err_trap_policer_register; } - mutex_unlock(&devlink->lock); - return 0; err_trap_policer_register: err_trap_policer_verify: for (i--; i >= 0; i--) devlink_trap_policer_unregister(devlink, &policers[i]); - mutex_unlock(&devlink->lock); return err; } -EXPORT_SYMBOL_GPL(devlink_trap_policers_register); +EXPORT_SYMBOL_GPL(devl_trap_policers_register); /** - * devlink_trap_policers_unregister - Unregister packet trap policers from devlink. + * devl_trap_policers_unregister - Unregister packet trap policers from devlink. * @devlink: devlink. * @policers: Packet trap policers. * @policers_count: Count of provided packet trap policers. */ void -devlink_trap_policers_unregister(struct devlink *devlink, - const struct devlink_trap_policer *policers, - size_t policers_count) +devl_trap_policers_unregister(struct devlink *devlink, + const struct devlink_trap_policer *policers, + size_t policers_count) { int i; - mutex_lock(&devlink->lock); + devl_assert_locked(devlink); for (i = policers_count - 1; i >= 0; i--) devlink_trap_policer_unregister(devlink, &policers[i]); - mutex_unlock(&devlink->lock); } -EXPORT_SYMBOL_GPL(devlink_trap_policers_unregister); +EXPORT_SYMBOL_GPL(devl_trap_policers_unregister); static void __devlink_compat_running_version(struct devlink *devlink, char *buf, size_t len) { + struct devlink_info_req req = {}; const struct nlattr *nlattr; - struct devlink_info_req req; struct sk_buff *msg; int rem, err; @@ -11291,9 +12430,9 @@ void devlink_compat_running_version(struct devlink *devlink, if (!devlink->ops->info_get) return; - mutex_lock(&devlink->lock); + devl_lock(devlink); __devlink_compat_running_version(devlink, buf, len); - mutex_unlock(&devlink->lock); + devl_unlock(devlink); } int devlink_compat_flash_update(struct devlink *devlink, const char *file_name) @@ -11308,11 +12447,11 @@ int devlink_compat_flash_update(struct devlink *devlink, const char *file_name) if (ret) return ret; - mutex_lock(&devlink->lock); + devl_lock(devlink); devlink_flash_update_begin_notify(devlink); ret = devlink->ops->flash_update(devlink, ¶ms, NULL); devlink_flash_update_end_notify(devlink); - mutex_unlock(&devlink->lock); + devl_unlock(devlink); release_firmware(params.fw); @@ -11365,25 +12504,18 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) /* In case network namespace is getting destroyed, reload * all devlink instances from this namespace into init_net. */ - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), net)) - goto retry; - + devlinks_xa_for_each_registered_get(net, index, devlink) { WARN_ON(!(devlink->features & DEVLINK_F_RELOAD)); + mutex_lock(&devlink->lock); err = devlink_reload(devlink, &init_net, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, DEVLINK_RELOAD_LIMIT_UNSPEC, &actions_performed, NULL); + mutex_unlock(&devlink->lock); if (err && err != -EOPNOTSUPP) pr_warn("Failed to reload devlink instance into init_net\n"); -retry: devlink_put(devlink); } - mutex_unlock(&devlink_mutex); } static struct pernet_operations devlink_pernet_ops __net_initdata = { diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 49442cae6f69..f084a4a6b7ab 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -51,12 +51,11 @@ static bool monitor_hw; /* net_dm_mutex * * An overall lock guarding every operation coming from userspace. - * It also guards the global 'hw_stats_list' list. */ static DEFINE_MUTEX(net_dm_mutex); struct net_dm_stats { - u64 dropped; + u64_stats_t dropped; struct u64_stats_sync syncp; }; @@ -87,11 +86,9 @@ struct per_cpu_dm_data { }; struct dm_hw_stat_delta { - struct net_device *dev; unsigned long last_rx; - struct list_head list; - struct rcu_head rcu; unsigned long last_drop_val; + struct rcu_head rcu; }; static struct genl_family net_drop_monitor_family; @@ -102,7 +99,6 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; -static LIST_HEAD(hw_stats_list); static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; static u32 net_dm_trunc_len; @@ -110,7 +106,8 @@ static u32 net_dm_queue_len = 1000; struct net_dm_alert_ops { void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, - void *location); + void *location, + enum skb_drop_reason reason); void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, int work, int budget); void (*work_item_func)(struct work_struct *work); @@ -125,6 +122,7 @@ struct net_dm_skb_cb { struct devlink_trap_metadata *hw_metadata; void *pc; }; + enum skb_drop_reason reason; }; #define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) @@ -262,7 +260,9 @@ out: spin_unlock_irqrestore(&data->lock, flags); } -static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location) +static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, + void *location, + enum skb_drop_reason reason) { trace_drop_common(skb, location); } @@ -270,29 +270,27 @@ static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *locatio static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, int work, int budget) { - struct dm_hw_stat_delta *new_stat; - + struct net_device *dev = napi->dev; + struct dm_hw_stat_delta *stat; /* * Don't check napi structures with no associated device */ - if (!napi->dev) + if (!dev) return; rcu_read_lock(); - list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + stat = rcu_dereference(dev->dm_private); + if (stat) { /* * only add a note to our monitor buffer if: - * 1) this is the dev we received on - * 2) its after the last_rx delta - * 3) our rx_dropped count has gone up + * 1) its after the last_rx delta + * 2) our rx_dropped count has gone up */ - if ((new_stat->dev == napi->dev) && - (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) && - (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) && + (dev->stats.rx_dropped != stat->last_drop_val)) { trace_drop_common(NULL, NULL); - new_stat->last_drop_val = napi->dev->stats.rx_dropped; - new_stat->last_rx = jiffies; - break; + stat->last_drop_val = dev->stats.rx_dropped; + stat->last_rx = jiffies; } } rcu_read_unlock(); @@ -466,7 +464,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, goto out; hw_entry = &hw_entries->entries[hw_entries->num_entries]; - strlcpy(hw_entry->trap_name, metadata->trap_name, + strscpy(hw_entry->trap_name, metadata->trap_name, NET_DM_MAX_HW_TRAP_NAME_LEN - 1); hw_entry->count = 1; hw_entries->num_entries++; @@ -490,10 +488,12 @@ static const struct net_dm_alert_ops net_dm_alert_summary_ops = { static void net_dm_packet_trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, - void *location) + void *location, + enum skb_drop_reason reason) { ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *data; + struct net_dm_skb_cb *cb; struct sk_buff *nskb; unsigned long flags; @@ -504,7 +504,11 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore, if (!nskb) return; - NET_DM_SKB_CB(nskb)->pc = location; + if (unlikely(reason >= SKB_DROP_REASON_MAX || reason <= 0)) + reason = SKB_DROP_REASON_NOT_SPECIFIED; + cb = NET_DM_SKB_CB(nskb); + cb->reason = reason; + cb->pc = location; /* Override the timestamp because we care about the time when the * packet was dropped. */ @@ -526,7 +530,7 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore, unlock_free: spin_unlock_irqrestore(&data->drop_queue.lock, flags); u64_stats_update_begin(&data->stats.syncp); - data->stats.dropped++; + u64_stats_inc(&data->stats.dropped); u64_stats_update_end(&data->stats.syncp); consume_skb(nskb); } @@ -549,7 +553,8 @@ static size_t net_dm_in_port_size(void) #define NET_DM_MAX_SYMBOL_LEN 40 -static size_t net_dm_packet_report_size(size_t payload_len) +static size_t net_dm_packet_report_size(size_t payload_len, + enum skb_drop_reason reason) { size_t size; @@ -570,6 +575,8 @@ static size_t net_dm_packet_report_size(size_t payload_len) nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ nla_total_size(sizeof(u16)) + + /* NET_DM_ATTR_REASON */ + nla_total_size(strlen(drop_reasons[reason]) + 1) + /* NET_DM_ATTR_PAYLOAD */ nla_total_size(payload_len); } @@ -602,7 +609,7 @@ nla_put_failure: static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, size_t payload_len) { - u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc; + struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb); char buf[NET_DM_MAX_SYMBOL_LEN]; struct nlattr *attr; void *hdr; @@ -616,10 +623,15 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW)) goto nla_put_failure; - if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD)) + if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc, + NET_DM_ATTR_PAD)) goto nla_put_failure; - snprintf(buf, sizeof(buf), "%pS", NET_DM_SKB_CB(skb)->pc); + if (nla_put_string(msg, NET_DM_ATTR_REASON, + drop_reasons[cb->reason])) + goto nla_put_failure; + + snprintf(buf, sizeof(buf), "%pS", cb->pc); if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf)) goto nla_put_failure; @@ -675,7 +687,9 @@ static void net_dm_packet_report(struct sk_buff *skb) if (net_dm_trunc_len) payload_len = min_t(size_t, net_dm_trunc_len, payload_len); - msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL); + msg = nlmsg_new(net_dm_packet_report_size(payload_len, + NET_DM_SKB_CB(skb)->reason), + GFP_KERNEL); if (!msg) goto out; @@ -850,7 +864,8 @@ net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata) } hw_metadata->input_dev = metadata->input_dev; - dev_hold(hw_metadata->input_dev); + netdev_hold(hw_metadata->input_dev, &hw_metadata->dev_tracker, + GFP_ATOMIC); return hw_metadata; @@ -864,9 +879,9 @@ free_hw_metadata: } static void -net_dm_hw_metadata_free(const struct devlink_trap_metadata *hw_metadata) +net_dm_hw_metadata_free(struct devlink_trap_metadata *hw_metadata) { - dev_put(hw_metadata->input_dev); + netdev_put(hw_metadata->input_dev, &hw_metadata->dev_tracker); kfree(hw_metadata->fa_cookie); kfree(hw_metadata->trap_name); kfree(hw_metadata->trap_group_name); @@ -971,7 +986,7 @@ net_dm_hw_trap_packet_probe(void *ignore, const struct devlink *devlink, unlock_free: spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); u64_stats_update_begin(&hw_data->stats.syncp); - hw_data->stats.dropped++; + u64_stats_inc(&hw_data->stats.dropped); u64_stats_update_end(&hw_data->stats.syncp); net_dm_hw_metadata_free(n_hw_metadata); free: @@ -1161,7 +1176,6 @@ err_module_put: static void net_dm_trace_off_set(void) { - struct dm_hw_stat_delta *new_stat, *temp; const struct net_dm_alert_ops *ops; int cpu; @@ -1185,13 +1199,6 @@ static void net_dm_trace_off_set(void) consume_skb(skb); } - list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { - if (new_stat->dev == NULL) { - list_del_rcu(&new_stat->list); - kfree_rcu(new_stat, rcu); - } - } - module_put(THIS_MODULE); } @@ -1426,10 +1433,10 @@ static void net_dm_stats_read(struct net_dm_stats *stats) do { start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); - dropped = cpu_stats->dropped; + dropped = u64_stats_read(&cpu_stats->dropped); } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); - stats->dropped += dropped; + u64_stats_add(&stats->dropped, dropped); } } @@ -1445,7 +1452,7 @@ static int net_dm_stats_put(struct sk_buff *msg) return -EMSGSIZE; if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, - stats.dropped, NET_DM_ATTR_PAD)) + u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD)) goto nla_put_failure; nla_nest_end(msg, attr); @@ -1470,10 +1477,10 @@ static void net_dm_hw_stats_read(struct net_dm_stats *stats) do { start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); - dropped = cpu_stats->dropped; + dropped = u64_stats_read(&cpu_stats->dropped); } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); - stats->dropped += dropped; + u64_stats_add(&stats->dropped, dropped); } } @@ -1489,7 +1496,7 @@ static int net_dm_hw_stats_put(struct sk_buff *msg) return -EMSGSIZE; if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, - stats.dropped, NET_DM_ATTR_PAD)) + u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD)) goto nla_put_failure; nla_nest_end(msg, attr); @@ -1552,38 +1559,28 @@ static int dropmon_net_event(struct notifier_block *ev_block, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct dm_hw_stat_delta *new_stat = NULL; - struct dm_hw_stat_delta *tmp; + struct dm_hw_stat_delta *stat; switch (event) { case NETDEV_REGISTER: - new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private))) + break; + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (!stat) + break; - if (!new_stat) - goto out; + stat->last_rx = jiffies; + rcu_assign_pointer(dev->dm_private, stat); - new_stat->dev = dev; - new_stat->last_rx = jiffies; - mutex_lock(&net_dm_mutex); - list_add_rcu(&new_stat->list, &hw_stats_list); - mutex_unlock(&net_dm_mutex); break; case NETDEV_UNREGISTER: - mutex_lock(&net_dm_mutex); - list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { - if (new_stat->dev == dev) { - new_stat->dev = NULL; - if (trace_state == TRACE_OFF) { - list_del_rcu(&new_stat->list); - kfree_rcu(new_stat, rcu); - break; - } - } + stat = rtnl_dereference(dev->dm_private); + if (stat) { + rcu_assign_pointer(dev->dm_private, NULL); + kfree_rcu(stat, rcu); } - mutex_unlock(&net_dm_mutex); break; } -out: return NOTIFY_DONE; } @@ -1648,6 +1645,7 @@ static struct genl_family net_drop_monitor_family __ro_after_init = { .module = THIS_MODULE, .small_ops = dropmon_ops, .n_small_ops = ARRAY_SIZE(dropmon_ops), + .resv_start_op = NET_DM_CMD_STATS_GET + 1, .mcgrps = dropmon_mcgrps, .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps), }; diff --git a/net/core/dst.c b/net/core/dst.c index 497ef9b3fc6a..bc9c9be4e080 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -49,7 +49,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, unsigned short flags) { dst->dev = dev; - dev_hold(dev); + netdev_hold(dev, &dst->dev_tracker, GFP_ATOMIC); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; @@ -117,7 +117,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) if (dst->ops->destroy) dst->ops->destroy(dst); - dev_put(dst->dev); + netdev_put(dst->dev, &dst->dev_tracker); lwtstate_put(dst->lwtstate); @@ -159,8 +159,8 @@ void dst_dev_put(struct dst_entry *dst) dst->input = dst_discard; dst->output = dst_discard_out; dst->dev = blackhole_netdev; - dev_hold(dst->dev); - dev_put(dev); + netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, + GFP_ATOMIC); } EXPORT_SYMBOL(dst_dev_put); diff --git a/net/core/failover.c b/net/core/failover.c index b5cd3c727285..864d2d83eff4 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -252,7 +252,7 @@ struct failover *failover_register(struct net_device *dev, return ERR_PTR(-ENOMEM); rcu_assign_pointer(failover->ops, ops); - dev_hold(dev); + netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL); dev->priv_flags |= IFF_FAILOVER; rcu_assign_pointer(failover->failover_dev, dev); @@ -285,7 +285,7 @@ void failover_unregister(struct failover *failover) failover_dev->name); failover_dev->priv_flags &= ~IFF_FAILOVER; - dev_put(failover_dev); + netdev_put(failover_dev, &failover->dev_tracker); spin_lock(&failover_lock); list_del(&failover->list); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 1bb567a3b329..75282222e0b4 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -750,6 +750,27 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, return 0; } +static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { + [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 }, + [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, + [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, + [FRA_PRIORITY] = { .type = NLA_U32 }, + [FRA_FWMARK] = { .type = NLA_U32 }, + [FRA_FLOW] = { .type = NLA_U32 }, + [FRA_TUN_ID] = { .type = NLA_U64 }, + [FRA_FWMASK] = { .type = NLA_U32 }, + [FRA_TABLE] = { .type = NLA_U32 }, + [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, + [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, + [FRA_GOTO] = { .type = NLA_U32 }, + [FRA_L3MDEV] = { .type = NLA_U8 }, + [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, + [FRA_PROTOCOL] = { .type = NLA_U8 }, + [FRA_IP_PROTO] = { .type = NLA_U8 }, + [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, + [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) } +}; + int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -774,7 +795,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, - ops->policy, extack); + fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; @@ -882,7 +903,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, - ops->policy, extack); + fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; diff --git a/net/core/filter.c b/net/core/filter.c index 6102f093d59a..bb0136e7a8e4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -18,6 +18,7 @@ */ #include <linux/atomic.h> +#include <linux/bpf_verifier.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> @@ -78,6 +79,7 @@ #include <linux/btf_ids.h> #include <net/tls.h> #include <net/xdp.h> +#include <net/mptcp.h> static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -236,7 +238,7 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - u16 tmp, *ptr; + __be16 tmp, *ptr; const int len = sizeof(tmp); if (offset >= 0) { @@ -263,7 +265,7 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - u32 tmp, *ptr; + __be32 tmp, *ptr; const int len = sizeof(tmp); if (likely(offset >= 0)) { @@ -301,7 +303,7 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, break; case SKF_AD_PKTTYPE: - *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET()); + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); @@ -323,7 +325,7 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, offsetof(struct sk_buff, vlan_tci)); break; case SKF_AD_VLAN_TAG_PRESENT: - *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET()); + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET); if (PKT_VLAN_PRESENT_BIT) *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT); if (PKT_VLAN_PRESENT_BIT < 7) @@ -1213,10 +1215,11 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); + int optmem_max = READ_ONCE(sysctl_optmem_max); /* same check as in sock_kmalloc() */ - if (filter_size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { + if (filter_size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) { atomic_add(filter_size, &sk->sk_omem_alloc); return true; } @@ -1242,10 +1245,9 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) int err, new_len, old_len = fp->len; bool seen_ld_abs = false; - /* We are free to overwrite insns et al right here as it - * won't be used at this point in time anymore internally - * after the migration to the internal BPF instruction - * representation. + /* We are free to overwrite insns et al right here as it won't be used at + * this point in time anymore internally after the migration to the eBPF + * instruction representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != sizeof(struct bpf_insn)); @@ -1336,8 +1338,8 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, */ bpf_jit_compile(fp); - /* JIT compiler couldn't process this filter, so do the - * internal BPF translation for the optimized interpreter. + /* JIT compiler couldn't process this filter, so do the eBPF translation + * for the optimized interpreter. */ if (!fp->jited) fp = bpf_migrate_filter(fp); @@ -1548,7 +1550,7 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - if (bpf_prog_size(prog->len) > sysctl_optmem_max) + if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); @@ -1615,7 +1617,7 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ - if (bpf_prog_size(prog->len) > sysctl_optmem_max) { + if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) { err = -ENOMEM; goto err_prog_put; } @@ -1688,7 +1690,7 @@ BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; - if (unlikely(offset > 0xffff)) + if (unlikely(offset > INT_MAX)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; @@ -1713,7 +1715,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -1723,7 +1725,7 @@ BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, { void *ptr; - if (unlikely(offset > 0xffff)) + if (unlikely(offset > INT_MAX)) goto err_clear; ptr = skb_header_pointer(skb, offset, len, to); @@ -2018,9 +2020,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; @@ -2108,7 +2110,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) } skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); dev_xmit_recursion_inc(); ret = dev_queue_xmit(skb); @@ -2177,7 +2179,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, } skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); @@ -2275,7 +2277,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, } skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); @@ -2541,7 +2543,7 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -2604,7 +2606,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len) + if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2711,6 +2713,9 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, if (unlikely(flags)) return -EINVAL; + if (unlikely(len == 0)) + return 0; + /* First find the starting scatterlist element */ i = msg->sg.start; do { @@ -2810,7 +2815,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; - __clear_bit(new, &msg->sg.copy); + __clear_bit(new, msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); @@ -3006,7 +3011,7 @@ BPF_CALL_0(bpf_get_cgroup_classid_curr) return __task_get_classid(current); } -static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { +const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .func = bpf_get_cgroup_classid_curr, .gpl_only = false, .ret_type = RET_INTEGER, @@ -3784,6 +3789,28 @@ static const struct bpf_func_proto sk_skb_change_head_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; + +BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) +{ + return xdp_get_buff_len(xdp); +} + +static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = { + .func = bpf_xdp_get_buff_len, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff) + +const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = { + .func = bpf_xdp_get_buff_len, + .gpl_only = false, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0], +}; + static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 0 : @@ -3818,11 +3845,208 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, + void *buf, unsigned long len, bool flush) +{ + unsigned long ptr_len, ptr_off = 0; + skb_frag_t *next_frag, *end_frag; + struct skb_shared_info *sinfo; + void *src, *dst; + u8 *ptr_buf; + + if (likely(xdp->data_end - xdp->data >= off + len)) { + src = flush ? buf : xdp->data + off; + dst = flush ? xdp->data + off : buf; + memcpy(dst, src, len); + return; + } + + sinfo = xdp_get_shared_info_from_buff(xdp); + end_frag = &sinfo->frags[sinfo->nr_frags]; + next_frag = &sinfo->frags[0]; + + ptr_len = xdp->data_end - xdp->data; + ptr_buf = xdp->data; + + while (true) { + if (off < ptr_off + ptr_len) { + unsigned long copy_off = off - ptr_off; + unsigned long copy_len = min(len, ptr_len - copy_off); + + src = flush ? buf : ptr_buf + copy_off; + dst = flush ? ptr_buf + copy_off : buf; + memcpy(dst, src, copy_len); + + off += copy_len; + len -= copy_len; + buf += copy_len; + } + + if (!len || next_frag == end_frag) + break; + + ptr_off += ptr_len; + ptr_buf = skb_frag_address(next_frag); + ptr_len = skb_frag_size(next_frag); + next_frag++; + } +} + +static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + u32 size = xdp->data_end - xdp->data; + void *addr = xdp->data; + int i; + + if (unlikely(offset > 0xffff || len > 0xffff)) + return ERR_PTR(-EFAULT); + + if (offset + len > xdp_get_buff_len(xdp)) + return ERR_PTR(-EINVAL); + + if (offset < size) /* linear area */ + goto out; + + offset -= size; + for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ + u32 frag_size = skb_frag_size(&sinfo->frags[i]); + + if (offset < frag_size) { + addr = skb_frag_address(&sinfo->frags[i]); + size = frag_size; + break; + } + offset -= frag_size; + } +out: + return offset + len <= size ? addr + offset : NULL; +} + +BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset, + void *, buf, u32, len) +{ + void *ptr; + + ptr = bpf_xdp_pointer(xdp, offset, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (!ptr) + bpf_xdp_copy_buf(xdp, offset, buf, len, false); + else + memcpy(buf, ptr, len); + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { + .func = bpf_xdp_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, + void *, buf, u32, len) +{ + void *ptr; + + ptr = bpf_xdp_pointer(xdp, offset, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (!ptr) + bpf_xdp_copy_buf(xdp, offset, buf, len, true); + else + memcpy(ptr, buf, len); + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { + .func = bpf_xdp_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; + struct xdp_rxq_info *rxq = xdp->rxq; + unsigned int tailroom; + + if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) + return -EOPNOTSUPP; + + tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); + if (unlikely(offset > tailroom)) + return -EINVAL; + + memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); + skb_frag_size_add(frag, offset); + sinfo->xdp_frags_size += offset; + + return 0; +} + +static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i, n_frags_free = 0, len_free = 0; + + if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN)) + return -EINVAL; + + for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) { + skb_frag_t *frag = &sinfo->frags[i]; + int shrink = min_t(int, offset, skb_frag_size(frag)); + + len_free += shrink; + offset -= shrink; + + if (skb_frag_size(frag) == shrink) { + struct page *page = skb_frag_page(frag); + + __xdp_return(page_address(page), &xdp->rxq->mem, + false, NULL); + n_frags_free++; + } else { + skb_frag_size_sub(frag, shrink); + break; + } + } + sinfo->nr_frags -= n_frags_free; + sinfo->xdp_frags_size -= len_free; + + if (unlikely(!sinfo->nr_frags)) { + xdp_buff_clear_frags_flag(xdp); + xdp->data_end -= offset; + } + + return 0; +} + BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; + if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */ + if (offset < 0) + return bpf_xdp_frags_shrink_tail(xdp, -offset); + + return bpf_xdp_frags_increase_tail(xdp, offset); + } + /* Notice that xdp_data_hard_end have reserved some tailroom */ if (unlikely(data_end > data_hard_end)) return -EINVAL; @@ -3958,10 +4182,35 @@ u32 xdp_master_redirect(struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_master_redirect); -int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) +static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, + struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + int err; + + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; + + err = __xsk_map_redirect(fwd, xdp); + if (unlikely(err)) + goto err; + + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); + return err; +} + +static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, + struct net_device *dev, + struct xdp_frame *xdpf, + struct bpf_prog *xdp_prog) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; @@ -3971,6 +4220,11 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; + if (unlikely(!xdpf)) { + err = -EOVERFLOW; + goto err; + } + switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; @@ -3978,17 +4232,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, map = READ_ONCE(ri->map); if (unlikely(map)) { WRITE_ONCE(ri->map, NULL); - err = dev_map_enqueue_multi(xdp, dev, map, + err = dev_map_enqueue_multi(xdpf, dev, map, ri->flags & BPF_F_EXCLUDE_INGRESS); } else { - err = dev_map_enqueue(fwd, xdp, dev); + err = dev_map_enqueue(fwd, xdpf, dev); } break; case BPF_MAP_TYPE_CPUMAP: - err = cpu_map_enqueue(fwd, xdp, dev); - break; - case BPF_MAP_TYPE_XSKMAP: - err = __xsk_map_redirect(fwd, xdp); + err = cpu_map_enqueue(fwd, xdpf, dev); break; case BPF_MAP_TYPE_UNSPEC: if (map_id == INT_MAX) { @@ -3997,7 +4248,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, err = -EINVAL; break; } - err = dev_xdp_enqueue(fwd, xdp, dev); + err = dev_xdp_enqueue(fwd, xdpf, dev); break; } fallthrough; @@ -4014,8 +4265,42 @@ err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } + +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + enum bpf_map_type map_type = ri->map_type; + + /* XDP_REDIRECT is not fully supported yet for xdp frags since + * not all XDP capable drivers can map non-linear xdp_frame in + * ndo_xdp_xmit. + */ + if (unlikely(xdp_buff_has_frags(xdp) && + map_type != BPF_MAP_TYPE_CPUMAP)) + return -EOPNOTSUPP; + + if (map_type == BPF_MAP_TYPE_XSKMAP) + return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); + + return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), + xdp_prog); +} EXPORT_SYMBOL_GPL(xdp_do_redirect); +int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, + struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + enum bpf_map_type map_type = ri->map_type; + + if (map_type == BPF_MAP_TYPE_XSKMAP) + return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); + + return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog); +} +EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); + static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, @@ -4174,7 +4459,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4188,7 +4473,7 @@ const struct bpf_func_proto bpf_skb_output_proto = { .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4205,7 +4490,8 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key void *to_orig = to; int err; - if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) { + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 | + BPF_F_TUNINFO_FLAGS)))) { err = -EINVAL; goto err_clear; } @@ -4216,6 +4502,7 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key if (unlikely(size != sizeof(struct bpf_tunnel_key))) { err = -EINVAL; switch (size) { + case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): goto set_compat; @@ -4236,15 +4523,22 @@ set_compat: to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; - to->tunnel_ext = 0; + if (flags & BPF_F_TUNINFO_FLAGS) + to->tunnel_flags = info->key.tun_flags; + else + to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); + memcpy(to->local_ipv6, &info->key.u.ipv6.dst, + sizeof(to->local_ipv6)); to->tunnel_label = be32_to_cpu(info->key.label); } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); + to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst); + memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3); to->tunnel_label = 0; } @@ -4315,6 +4609,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { + case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): @@ -4357,10 +4652,14 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); + memcpy(&info->key.u.ipv6.src, from->local_ipv6, + sizeof(from->local_ipv6)); info->key.label = cpu_to_be32(from->tunnel_label) & IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4); + info->key.flow_flags = FLOWI_FLAG_ANYSRC; } return 0; @@ -4371,7 +4670,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -4397,7 +4696,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -4538,10 +4837,12 @@ static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { }; #endif -static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, +static unsigned long bpf_xdp_copy(void *dst, const void *ctx, unsigned long off, unsigned long len) { - memcpy(dst_buff, src_buff + off, len); + struct xdp_buff *xdp = (struct xdp_buff *)ctx; + + bpf_xdp_copy_buf(xdp, off, dst, len, false); return 0; } @@ -4552,11 +4853,11 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(!xdp || - xdp_size > (unsigned long)(xdp->data_end - xdp->data))) + + if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp))) return -EFAULT; - return bpf_event_output(map, flags, meta, meta_size, xdp->data, + return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, bpf_xdp_copy); } @@ -4567,7 +4868,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4581,7 +4882,7 @@ const struct bpf_func_proto bpf_xdp_output_proto = { .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4718,345 +5019,316 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static int _bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) +static int sol_socket_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + switch (optname) { + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_RCVBUF: + case SO_KEEPALIVE: + case SO_PRIORITY: + case SO_REUSEPORT: + case SO_RCVLOWAT: + case SO_MARK: + case SO_MAX_PACING_RATE: + case SO_BINDTOIFINDEX: + case SO_TXREHASH: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + case SO_BINDTODEVICE: + break; + default: + return -EINVAL; + } + + if (getopt) { + if (optname == SO_BINDTODEVICE) + return -EINVAL; + return sk_getsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } + + return sk_setsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), *optlen); +} + +static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, + char *optval, int optlen) { - char devname[IFNAMSIZ]; - int val, valbool; - struct net *net; - int ifindex; - int ret = 0; + struct tcp_sock *tp = tcp_sk(sk); + unsigned long timeout; + int val; - if (!sk_fullsock(sk)) + if (optlen != sizeof(int)) return -EINVAL; - sock_owned_by_me(sk); + val = *(int *)optval; - if (level == SOL_SOCKET) { - if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) + /* Only some options are supported */ + switch (optname) { + case TCP_BPF_IW: + if (val <= 0 || tp->data_segs_out > tp->syn_data) return -EINVAL; - val = *((int *)optval); - valbool = val ? 1 : 0; - - /* Only some socketops are supported */ - switch (optname) { - case SO_RCVBUF: - val = min_t(u32, val, sysctl_rmem_max); - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - WRITE_ONCE(sk->sk_rcvbuf, - max_t(int, val * 2, SOCK_MIN_RCVBUF)); - break; - case SO_SNDBUF: - val = min_t(u32, val, sysctl_wmem_max); - sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - WRITE_ONCE(sk->sk_sndbuf, - max_t(int, val * 2, SOCK_MIN_SNDBUF)); - break; - case SO_MAX_PACING_RATE: /* 32bit version */ - if (val != ~0U) - cmpxchg(&sk->sk_pacing_status, - SK_PACING_NONE, - SK_PACING_NEEDED); - sk->sk_max_pacing_rate = (val == ~0U) ? - ~0UL : (unsigned int)val; - sk->sk_pacing_rate = min(sk->sk_pacing_rate, - sk->sk_max_pacing_rate); - break; - case SO_PRIORITY: - sk->sk_priority = val; - break; - case SO_RCVLOWAT: - if (val < 0) - val = INT_MAX; - WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); - break; - case SO_MARK: - if (sk->sk_mark != val) { - sk->sk_mark = val; - sk_dst_reset(sk); - } - break; - case SO_BINDTODEVICE: - optlen = min_t(long, optlen, IFNAMSIZ - 1); - strncpy(devname, optval, optlen); - devname[optlen] = 0; + tcp_snd_cwnd_set(tp, val); + break; + case TCP_BPF_SNDCWND_CLAMP: + if (val <= 0) + return -EINVAL; + tp->snd_cwnd_clamp = val; + tp->snd_ssthresh = val; + break; + case TCP_BPF_DELACK_MAX: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_DELACK_MAX || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_delack_max = timeout; + break; + case TCP_BPF_RTO_MIN: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_RTO_MIN || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_rto_min = timeout; + break; + default: + return -EINVAL; + } - ifindex = 0; - if (devname[0] != '\0') { - struct net_device *dev; + return 0; +} - ret = -ENODEV; +static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, + int *optlen, bool getopt) +{ + struct tcp_sock *tp; + int ret; - net = sock_net(sk); - dev = dev_get_by_name(net, devname); - if (!dev) - break; - ifindex = dev->ifindex; - dev_put(dev); - } - fallthrough; - case SO_BINDTOIFINDEX: - if (optname == SO_BINDTOIFINDEX) - ifindex = val; - ret = sock_bindtoindex(sk, ifindex, false); - break; - case SO_KEEPALIVE: - if (sk->sk_prot->keepalive) - sk->sk_prot->keepalive(sk, valbool); - sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); - break; - case SO_REUSEPORT: - sk->sk_reuseport = valbool; - break; - default: - ret = -EINVAL; - } -#ifdef CONFIG_INET - } else if (level == SOL_IP) { - if (optlen != sizeof(int) || sk->sk_family != AF_INET) + if (*optlen < 2) + return -EINVAL; + + if (getopt) { + if (!inet_csk(sk)->icsk_ca_ops) return -EINVAL; + /* BPF expects NULL-terminated tcp-cc string */ + optval[--(*optlen)] = '\0'; + return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case IP_TOS: - if (val < -1 || val > 0xff) { - ret = -EINVAL; - } else { - struct inet_sock *inet = inet_sk(sk); + /* "cdg" is the only cc that alloc a ptr + * in inet_csk_ca area. The bpf-tcp-cc may + * overwrite this ptr after switching to cdg. + */ + if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) + return -ENOTSUPP; - if (val == -1) - val = 0; - inet->tos = val; - } - break; - default: - ret = -EINVAL; - } -#if IS_ENABLED(CONFIG_IPV6) - } else if (level == SOL_IPV6) { - if (optlen != sizeof(int) || sk->sk_family != AF_INET6) - return -EINVAL; + /* It stops this looping + * + * .init => bpf_setsockopt(tcp_cc) => .init => + * bpf_setsockopt(tcp_cc)" => .init => .... + * + * The second bpf_setsockopt(tcp_cc) is not allowed + * in order to break the loop when both .init + * are the same bpf prog. + * + * This applies even the second bpf_setsockopt(tcp_cc) + * does not cause a loop. This limits only the first + * '.init' can call bpf_setsockopt(TCP_CONGESTION) to + * pick a fallback cc (eg. peer does not support ECN) + * and the second '.init' cannot fallback to + * another. + */ + tp = tcp_sk(sk); + if (tp->bpf_chg_cc_inprogress) + return -EBUSY; + + tp->bpf_chg_cc_inprogress = 1; + ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), *optlen); + tp->bpf_chg_cc_inprogress = 0; + return ret; +} - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case IPV6_TCLASS: - if (val < -1 || val > 0xff) { - ret = -EINVAL; - } else { - struct ipv6_pinfo *np = inet6_sk(sk); +static int sol_tcp_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + if (sk->sk_prot->setsockopt != tcp_setsockopt) + return -EINVAL; - if (val == -1) - val = 0; - np->tclass = val; - } - break; - default: - ret = -EINVAL; - } -#endif - } else if (level == SOL_TCP && - sk->sk_prot->setsockopt == tcp_setsockopt) { - if (optname == TCP_CONGESTION) { - char name[TCP_CA_NAME_MAX]; - - strncpy(name, optval, min_t(long, optlen, - TCP_CA_NAME_MAX-1)); - name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, true); - } else { - struct inet_connection_sock *icsk = inet_csk(sk); + switch (optname) { + case TCP_NODELAY: + case TCP_MAXSEG: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_KEEPCNT: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + case TCP_THIN_LINEAR_TIMEOUTS: + case TCP_USER_TIMEOUT: + case TCP_NOTSENT_LOWAT: + case TCP_SAVE_SYN: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + case TCP_CONGESTION: + return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt); + case TCP_SAVED_SYN: + if (*optlen < 1) + return -EINVAL; + break; + default: + if (getopt) + return -EINVAL; + return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); + } + + if (getopt) { + if (optname == TCP_SAVED_SYN) { struct tcp_sock *tp = tcp_sk(sk); - unsigned long timeout; - if (optlen != sizeof(int)) + if (!tp->saved_syn || + *optlen > tcp_saved_syn_len(tp->saved_syn)) return -EINVAL; - - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case TCP_BPF_IW: - if (val <= 0 || tp->data_segs_out > tp->syn_data) - ret = -EINVAL; - else - tp->snd_cwnd = val; - break; - case TCP_BPF_SNDCWND_CLAMP: - if (val <= 0) { - ret = -EINVAL; - } else { - tp->snd_cwnd_clamp = val; - tp->snd_ssthresh = val; - } - break; - case TCP_BPF_DELACK_MAX: - timeout = usecs_to_jiffies(val); - if (timeout > TCP_DELACK_MAX || - timeout < TCP_TIMEOUT_MIN) - return -EINVAL; - inet_csk(sk)->icsk_delack_max = timeout; - break; - case TCP_BPF_RTO_MIN: - timeout = usecs_to_jiffies(val); - if (timeout > TCP_RTO_MIN || - timeout < TCP_TIMEOUT_MIN) - return -EINVAL; - inet_csk(sk)->icsk_rto_min = timeout; - break; - case TCP_SAVE_SYN: - if (val < 0 || val > 1) - ret = -EINVAL; - else - tp->save_syn = val; - break; - case TCP_KEEPIDLE: - ret = tcp_sock_set_keepidle_locked(sk, val); - break; - case TCP_KEEPINTVL: - if (val < 1 || val > MAX_TCP_KEEPINTVL) - ret = -EINVAL; - else - tp->keepalive_intvl = val * HZ; - break; - case TCP_KEEPCNT: - if (val < 1 || val > MAX_TCP_KEEPCNT) - ret = -EINVAL; - else - tp->keepalive_probes = val; - break; - case TCP_SYNCNT: - if (val < 1 || val > MAX_TCP_SYNCNT) - ret = -EINVAL; - else - icsk->icsk_syn_retries = val; - break; - case TCP_USER_TIMEOUT: - if (val < 0) - ret = -EINVAL; - else - icsk->icsk_user_timeout = val; - break; - case TCP_NOTSENT_LOWAT: - tp->notsent_lowat = val; - sk->sk_write_space(sk); - break; - case TCP_WINDOW_CLAMP: - ret = tcp_set_window_clamp(sk, val); - break; - default: - ret = -EINVAL; - } + memcpy(optval, tp->saved_syn->data, *optlen); + /* It cannot free tp->saved_syn here because it + * does not know if the user space still needs it. + */ + return 0; } -#endif - } else { - ret = -EINVAL; + + return do_tcp_getsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); } - return ret; + + return do_tcp_setsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), *optlen); } -static int _bpf_getsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) +static int sol_ip_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) { - if (!sk_fullsock(sk)) - goto err_clear; + if (sk->sk_family != AF_INET) + return -EINVAL; - sock_owned_by_me(sk); + switch (optname) { + case IP_TOS: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } - if (level == SOL_SOCKET) { - if (optlen != sizeof(int)) - goto err_clear; + if (getopt) + return do_ip_getsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); - switch (optname) { - case SO_MARK: - *((int *)optval) = sk->sk_mark; - break; - case SO_PRIORITY: - *((int *)optval) = sk->sk_priority; - break; - case SO_BINDTOIFINDEX: - *((int *)optval) = sk->sk_bound_dev_if; - break; - case SO_REUSEPORT: - *((int *)optval) = sk->sk_reuseport; - break; - default: - goto err_clear; - } -#ifdef CONFIG_INET - } else if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { - struct inet_connection_sock *icsk; - struct tcp_sock *tp; + return do_ip_setsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), *optlen); +} - switch (optname) { - case TCP_CONGESTION: - icsk = inet_csk(sk); +static int sol_ipv6_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + if (sk->sk_family != AF_INET6) + return -EINVAL; - if (!icsk->icsk_ca_ops || optlen <= 1) - goto err_clear; - strncpy(optval, icsk->icsk_ca_ops->name, optlen); - optval[optlen - 1] = 0; - break; - case TCP_SAVED_SYN: - tp = tcp_sk(sk); + switch (optname) { + case IPV6_TCLASS: + case IPV6_AUTOFLOWLABEL: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } - if (optlen <= 0 || !tp->saved_syn || - optlen > tcp_saved_syn_len(tp->saved_syn)) - goto err_clear; - memcpy(optval, tp->saved_syn->data, optlen); - break; - default: - goto err_clear; - } - } else if (level == SOL_IP) { - struct inet_sock *inet = inet_sk(sk); + if (getopt) + return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); - if (optlen != sizeof(int) || sk->sk_family != AF_INET) - goto err_clear; + return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), *optlen); +} - /* Only some options are supported */ - switch (optname) { - case IP_TOS: - *((int *)optval) = (int)inet->tos; - break; - default: - goto err_clear; - } -#if IS_ENABLED(CONFIG_IPV6) - } else if (level == SOL_IPV6) { - struct ipv6_pinfo *np = inet6_sk(sk); +static int __bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (!sk_fullsock(sk)) + return -EINVAL; - if (optlen != sizeof(int) || sk->sk_family != AF_INET6) - goto err_clear; + if (level == SOL_SOCKET) + return sol_socket_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) + return sol_ip_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + return sol_ipv6_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) + return sol_tcp_sockopt(sk, optname, optval, &optlen, false); - /* Only some options are supported */ - switch (optname) { - case IPV6_TCLASS: - *((int *)optval) = (int)np->tclass; - break; - default: - goto err_clear; - } -#endif -#endif - } else { - goto err_clear; - } - return 0; -err_clear: - memset(optval, 0, optlen); return -EINVAL; } -BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, - int, optname, char *, optval, int, optlen) +static int _bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) { - if (level == SOL_TCP && optname == TCP_CONGESTION) { - if (optlen >= sizeof("cdg") - 1 && - !strncmp("cdg", optval, optlen)) - return -ENOTSUPP; + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} + +static int __bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + int err, saved_optlen = optlen; + + if (!sk_fullsock(sk)) { + err = -EINVAL; + goto done; } + if (level == SOL_SOCKET) + err = sol_socket_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) + err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) + err = sol_ip_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true); + else + err = -EINVAL; + +done: + if (err) + optlen = 0; + if (optlen < saved_optlen) + memset(optval + optlen, 0, saved_optlen - optlen); + return err; +} + +static int _bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + return __bpf_getsockopt(sk, level, optname, optval, optlen); +} + +BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ return _bpf_setsockopt(sk, level, optname, optval, optlen); } @@ -5067,7 +5339,7 @@ const struct bpf_func_proto bpf_sk_setsockopt_proto = { .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5088,6 +5360,40 @@ const struct bpf_func_proto bpf_sk_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = { + .func = bpf_unlocked_sk_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return __bpf_getsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = { + .func = bpf_unlocked_sk_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { @@ -5101,7 +5407,7 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5135,7 +5441,7 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5310,7 +5616,7 @@ static const struct bpf_func_proto bpf_bind_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -5846,7 +6152,6 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len if (err) return err; - ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); return seg6_lookup_nexthop(skb, NULL, 0); @@ -5898,7 +6203,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5908,7 +6213,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5951,7 +6256,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -6039,7 +6344,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -6107,6 +6412,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { + struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; bool refcounted = false; struct sock *sk = NULL; @@ -6115,7 +6421,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0, + sk = __inet_lookup(net, hinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); @@ -6129,7 +6435,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, + sk = __inet6_lookup(net, hinfo, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); @@ -6151,8 +6457,6 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, /* bpf_skc_lookup performs the core lookup for different types of sockets, * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. - * Returns the socket as an 'unsigned long' to simplify the casting in the - * callers to satisfy BPF_CALL declarations. */ static struct sock * __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, @@ -6160,8 +6464,8 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, u64 flags) { struct sock *sk = NULL; - u8 family = AF_UNSPEC; struct net *net; + u8 family; int sdif; if (len == sizeof(tuple->ipv4)) @@ -6171,8 +6475,7 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, else return NULL; - if (unlikely(family == AF_UNSPEC || flags || - !((s32)netns_id < 0 || netns_id <= S32_MAX))) + if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; if (family == AF_INET) @@ -6204,10 +6507,21 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, ifindex, proto, netns_id, flags); if (sk) { - sk = sk_to_full_sk(sk); - if (!sk_fullsock(sk)) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { sock_gen_put(sk); - return NULL; + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; } } @@ -6241,10 +6555,21 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, flags); if (sk) { - sk = sk_to_full_sk(sk); - if (!sk_fullsock(sk)) { + struct sock *sk2 = sk_to_full_sk(sk); + + /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk + * sock refcnt is decremented to prevent a request_sock leak. + */ + if (!sk_fullsock(sk2)) + sk2 = NULL; + if (sk2 != sk) { sock_gen_put(sk); - return NULL; + /* Ensure there is no need to bump sk2 refcnt */ + if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + return NULL; + } + sk = sk2; } } @@ -6264,7 +6589,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6283,7 +6608,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6302,7 +6627,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6319,7 +6644,7 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .func = bpf_sk_release, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE, }; BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, @@ -6339,7 +6664,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6362,7 +6687,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6385,7 +6710,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6404,7 +6729,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6423,7 +6748,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6442,7 +6767,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6708,30 +7033,39 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; - if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -EINVAL; if (!th->ack || th->rst || th->syn) return -ENOENT; + if (unlikely(iph_len < sizeof(struct iphdr))) + return -EINVAL; + if (tcp_synq_no_recent_overflow(sk)) return -ENOENT; cookie = ntohl(th->ack_seq) - 1; - switch (sk->sk_family) { - case AF_INET: - if (unlikely(iph_len < sizeof(struct iphdr))) + /* Both struct iphdr and struct ipv6hdr have the version field at the + * same offset so we can cast to the shorter header (struct iphdr). + */ + switch (((struct iphdr *)iph)->version) { + case 4: + if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; ret = __cookie_v4_check((struct iphdr *)iph, th, cookie); break; #if IS_BUILTIN(CONFIG_IPV6) - case AF_INET6: + case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; + if (sk->sk_family != AF_INET6) + return -EINVAL; + ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie); break; #endif /* CONFIG_IPV6 */ @@ -6755,9 +7089,9 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -6774,7 +7108,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; - if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies) + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -ENOENT; if (!th->syn || th->ack || th->fin || th->rst) @@ -6788,7 +7122,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, */ switch (((struct iphdr *)iph)->version) { case 4: - if (sk->sk_family == AF_INET6 && sk->sk_ipv6only) + if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; mss = tcp_v4_get_syncookie(sk, iph, th, &cookie); @@ -6824,9 +7158,9 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -7055,7 +7389,7 @@ static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -7086,6 +7420,151 @@ static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, + u64, tstamp, u32, tstamp_type) +{ + /* skb_clear_delivery_time() is done for inet protocol */ + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -EOPNOTSUPP; + + switch (tstamp_type) { + case BPF_SKB_TSTAMP_DELIVERY_MONO: + if (!tstamp) + return -EINVAL; + skb->tstamp = tstamp; + skb->mono_delivery_time = 1; + break; + case BPF_SKB_TSTAMP_UNSPEC: + if (tstamp) + return -EINVAL; + skb->tstamp = 0; + skb->mono_delivery_time = 0; + break; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { + .func = bpf_skb_set_tstamp, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +#ifdef CONFIG_SYN_COOKIES +BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph, + struct tcphdr *, th, u32, th_len) +{ + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT; + cookie = __cookie_v4_init_sequence(iph, th, &mss); + + return cookie | ((u64)mss << 32); +} + +static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { + .func = bpf_tcp_raw_gen_syncookie_ipv4, + .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct iphdr), + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph, + struct tcphdr *, th, u32, th_len) +{ +#if IS_BUILTIN(CONFIG_IPV6) + const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - + sizeof(struct ipv6hdr); + u32 cookie; + u16 mss; + + if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + return -EINVAL; + + mss = tcp_parse_mss_option(th, 0) ?: mss_clamp; + cookie = __cookie_v6_init_sequence(iph, th, &mss); + + return cookie | ((u64)mss << 32); +#else + return -EPROTONOSUPPORT; +#endif +} + +static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { + .func = bpf_tcp_raw_gen_syncookie_ipv6, + .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct ipv6hdr), + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, + struct tcphdr *, th) +{ + u32 cookie = ntohl(th->ack_seq) - 1; + + if (__cookie_v4_check(iph, th, cookie) > 0) + return 0; + + return -EACCES; +} + +static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { + .func = bpf_tcp_raw_check_syncookie_ipv4, + .gpl_only = true, /* __cookie_v4_check is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct iphdr), + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_size = sizeof(struct tcphdr), +}; + +BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, + struct tcphdr *, th) +{ +#if IS_BUILTIN(CONFIG_IPV6) + u32 cookie = ntohl(th->ack_seq) - 1; + + if (__cookie_v6_check(iph, th, cookie) > 0) + return 0; + + return -EACCES; +#else + return -EPROTONOSUPPORT; +#endif +} + +static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { + .func = bpf_tcp_raw_check_syncookie_ipv6, + .gpl_only = true, /* __cookie_v6_check is GPL */ + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_size = sizeof(struct ipv6hdr), + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_size = sizeof(struct tcphdr), +}; +#endif /* CONFIG_SYN_COOKIES */ + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -7132,34 +7611,23 @@ const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - /* inet and inet6 sockets are created in a process - * context so there is always a valid uid/gid - */ - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: @@ -7172,12 +7640,17 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) static const struct bpf_func_proto * sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - /* inet and inet6 sockets are created in a process - * context so there is always a valid uid/gid - */ - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; case BPF_FUNC_bind: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_CONNECT: @@ -7190,24 +7663,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_addr_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; @@ -7288,9 +7745,13 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: @@ -7447,6 +7908,18 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; + case BPF_FUNC_skb_set_tstamp: + return &bpf_skb_set_tstamp_proto; +#ifdef CONFIG_SYN_COOKIES + case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: + return &bpf_tcp_raw_gen_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: + return &bpf_tcp_raw_gen_syncookie_ipv6_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv4: + return &bpf_tcp_raw_check_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv6: + return &bpf_tcp_raw_check_syncookie_ipv6_proto; +#endif #endif default: return bpf_sk_base_func_proto(func_id); @@ -7473,6 +7946,12 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; + case BPF_FUNC_xdp_get_buff_len: + return &bpf_xdp_get_buff_len_proto; + case BPF_FUNC_xdp_load_bytes: + return &bpf_xdp_load_bytes_proto; + case BPF_FUNC_xdp_store_bytes: + return &bpf_xdp_store_bytes_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; case BPF_FUNC_check_mtu: @@ -7490,6 +7969,16 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; +#ifdef CONFIG_SYN_COOKIES + case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: + return &bpf_tcp_raw_gen_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: + return &bpf_tcp_raw_gen_syncookie_ipv6_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv4: + return &bpf_tcp_raw_check_syncookie_ipv4_proto; + case BPF_FUNC_tcp_raw_check_syncookie_ipv6: + return &bpf_tcp_raw_check_syncookie_ipv6_proto; +#endif #endif default: return bpf_sk_base_func_proto(func_id); @@ -7502,6 +7991,12 @@ const struct bpf_func_proto bpf_sock_hash_update_proto __weak; static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sock_ops_setsockopt_proto; @@ -7515,8 +8010,6 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: @@ -7780,7 +8273,9 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; - case offsetofend(struct __sk_buff, gso_size) ... offsetof(struct __sk_buff, hwtstamp) - 1: + case offsetof(struct __sk_buff, tstamp_type): + return false; + case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. */ return false; default: @@ -7970,6 +8465,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); + int field_size; if (off < 0 || off >= sizeof(struct bpf_sock)) return false; @@ -7981,7 +8477,6 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): - case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): @@ -7990,6 +8485,14 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case bpf_ctx_range(struct bpf_sock, dst_port): + field_size = size == size_default ? + size_default : sizeof_field(struct bpf_sock, dst_port); + bpf_ctx_record_field_size(info, field_size); + return bpf_ctx_narrow_access_ok(off, size, field_size); + case offsetofend(struct bpf_sock, dst_port) ... + offsetof(struct bpf_sock, dst_ip4) - 1: + return false; } return size == size_default; @@ -8029,7 +8532,7 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, * (Fast-path, otherwise approximation that we might be * a clone, do the rest in helper.) */ - *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); @@ -8127,11 +8630,50 @@ static bool tc_cls_act_is_valid_access(int off, int size, break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; + case offsetof(struct __sk_buff, tstamp_type): + /* The convert_ctx_access() on reading and writing + * __sk_buff->tstamp depends on whether the bpf prog + * has used __sk_buff->tstamp_type or not. + * Thus, we need to set prog->tstamp_type_access + * earlier during is_valid_access() here. + */ + ((struct bpf_prog *)prog)->tstamp_type_access = 1; + return size == sizeof(__u8); } return bpf_skb_is_valid_access(off, size, type, prog, info); } +DEFINE_MUTEX(nf_conn_btf_access_lock); +EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); + +int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); +EXPORT_SYMBOL_GPL(nfct_btf_struct_access); + +static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + int ret = -EACCES; + + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -8181,16 +8723,37 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } -void bpf_warn_invalid_xdp_action(u32 act) +void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; - WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n", - act > act_max ? "Illegal" : "Driver unsupported", - act); + pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n", + act > act_max ? "Illegal" : "Driver unsupported", + act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A"); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static int xdp_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + int ret = -EACCES; + + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -8522,6 +9085,25 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + /* AX is needed because src_reg and dst_reg could be the same */ + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + PKT_VLAN_PRESENT_OFFSET); + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK, 2); + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); + *insn++ = BPF_JMP_A(1); + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); + + return insn; +} + static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, struct bpf_insn *insn) { @@ -8543,6 +9125,74 @@ static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, return insn; } +static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, + const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + +#ifdef CONFIG_NET_CLS_ACT + /* If the tstamp_type is read, + * the bpf prog is aware the tstamp could have delivery time. + * Thus, read skb->tstamp as is if tstamp_type_access is true. + */ + if (!prog->tstamp_type_access) { + /* AX is needed because src_reg and dst_reg could be the same */ + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg, + TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2); + /* skb->tc_at_ingress && skb->mono_delivery_time, + * read 0 as the (rcv) timestamp. + */ + *insn++ = BPF_MOV64_IMM(value_reg, 0); + *insn++ = BPF_JMP_A(1); + } +#endif + + *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, + offsetof(struct sk_buff, tstamp)); + return insn; +} + +static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, + const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->src_reg; + __u8 skb_reg = si->dst_reg; + +#ifdef CONFIG_NET_CLS_ACT + /* If the tstamp_type is read, + * the bpf prog is aware the tstamp could have delivery time. + * Thus, write skb->tstamp as is if tstamp_type_access is true. + * Otherwise, writing at ingress will have to clear the + * mono_delivery_time bit also. + */ + if (!prog->tstamp_type_access) { + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); + /* Writing __sk_buff->tstamp as ingress, goto <clear> */ + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); + /* goto <store> */ + *insn++ = BPF_JMP_A(2); + /* <clear>: mono_delivery_time */ + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, PKT_VLAN_PRESENT_OFFSET); + } +#endif + + /* <store>: skb->tstamp = tstamp */ + *insn++ = BPF_STX_MEM(BPF_DW, skb_reg, value_reg, + offsetof(struct sk_buff, tstamp)); + return insn; +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -8617,7 +9267,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, pkt_type): *target_size = 1; *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, - PKT_TYPE_OFFSET()); + PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); @@ -8642,7 +9292,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, vlan_present): *target_size = 1; *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, - PKT_VLAN_PRESENT_OFFSET()); + PKT_VLAN_PRESENT_OFFSET); if (PKT_VLAN_PRESENT_BIT) *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT); if (PKT_VLAN_PRESENT_BIT < 7) @@ -8851,17 +9501,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_DW, - si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - tstamp, 8, - target_size)); + insn = bpf_convert_tstamp_write(prog, si, insn); else - *insn++ = BPF_LDX_MEM(BPF_DW, - si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - tstamp, 8, - target_size)); + insn = bpf_convert_tstamp_read(prog, si, insn); + break; + + case offsetof(struct __sk_buff, tstamp_type): + insn = bpf_convert_tstamp_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): @@ -10002,7 +10648,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, - .check_kfunc_call = bpf_prog_test_check_kfunc_call, + .btf_struct_access = tc_cls_act_btf_struct_access, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { @@ -10014,6 +10660,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, .gen_prologue = bpf_noop_prologue, + .btf_struct_access = xdp_btf_struct_access, }; const struct bpf_prog_ops xdp_prog_ops = { @@ -10148,14 +10795,13 @@ int sk_detach_filter(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_detach_filter); -int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, - unsigned int len) +int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) { struct sock_fprog_kern *fprog; struct sk_filter *filter; int ret = 0; - lock_sock(sk); + sockopt_lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (!filter) @@ -10180,7 +10826,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, goto out; ret = -EFAULT; - if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog))) + if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number @@ -10188,7 +10834,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, */ ret = fprog->len; out: - release_sock(sk); + sockopt_release_sock(sk); return ret; } @@ -10541,11 +11187,24 @@ static bool sk_lookup_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): - case bpf_ctx_range(struct bpf_sk_lookup, remote_port): case bpf_ctx_range(struct bpf_sk_lookup, local_port): + case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex): bpf_ctx_record_field_size(info, sizeof(__u32)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); + case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + /* Allow 4-byte access to 2-byte field for backward compatibility */ + if (size == sizeof(__u32)) + return true; + bpf_ctx_record_field_size(info, sizeof(__be16)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16)); + + case offsetofend(struct bpf_sk_lookup, remote_port) ... + offsetof(struct bpf_sk_lookup, local_ip4) - 1: + /* Allow access to zero padding for backward compatibility */ + bpf_ctx_record_field_size(info, sizeof(__u16)); + return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16)); + default: return false; } @@ -10627,11 +11286,22 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, sport, 2, target_size)); break; + case offsetofend(struct bpf_sk_lookup, remote_port): + *target_size = 2; + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); + break; + case offsetof(struct bpf_sk_lookup, local_port): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, dport, 2, target_size)); break; + + case offsetof(struct bpf_sk_lookup, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sk_lookup_kern, + ingress_ifindex, 4, target_size)); + break; } return insn - insn_buf; @@ -10656,14 +11326,10 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } -#ifdef CONFIG_DEBUG_INFO_BTF -BTF_ID_LIST_GLOBAL(btf_sock_ids) +BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE) #define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) BTF_SOCK_TYPE_xxx #undef BTF_SOCK_TYPE -#else -u32 btf_sock_ids[MAX_BTF_SOCK_TYPE]; -#endif BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) { @@ -10795,6 +11461,20 @@ const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], }; +BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk) +{ + BTF_TYPE_EMIT(struct mptcp_sock); + return (unsigned long)bpf_mptcp_sock_from_subflow(sk); +} + +const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = { + .func = bpf_skc_to_mptcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP], +}; + BPF_CALL_1(bpf_sock_from_file, struct file *, file) { return (unsigned long)sock_from_file(file); @@ -10837,6 +11517,9 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; + case BPF_FUNC_skc_to_mptcp_sock: + func = &bpf_skc_to_mptcp_sock_proto; + break; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 3255f57f5131..25cd35f5922e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -5,6 +5,7 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_vlan.h> +#include <linux/filter.h> #include <net/dsa.h> #include <net/dst_metadata.h> #include <net/ip.h> @@ -21,6 +22,7 @@ #include <linux/ppp_defs.h> #include <linux/stddef.h> #include <linux/if_ether.h> +#include <linux/if_hsr.h> #include <linux/mpls.h> #include <linux/tcp.h> #include <linux/ptp_classify.h> @@ -202,6 +204,30 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } +static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_l2tpv3 *key_l2tpv3; + struct { + __be32 session_id; + } *hdr, _hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_l2tpv3 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_L2TPV3, + target_container); + + key_l2tpv3->session_id = hdr->session_id; +} + void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) @@ -238,7 +264,7 @@ void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, - size_t mapsize, bool post_ct) + size_t mapsize, bool post_ct, u16 zone) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct flow_dissector_key_ct *key; @@ -260,6 +286,7 @@ skb_flow_dissect_ct(const struct sk_buff *skb, if (!ct) { key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_INVALID; + key->ct_zone = zone; return; } @@ -863,8 +890,8 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, } } -bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen, unsigned int flags) +u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, + __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; @@ -889,7 +916,12 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->nhoff, hlen); - return result == BPF_OK; + return result; +} + +static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr) +{ + return hdr->ver == 1 && hdr->type == 1 && hdr->code == 0; } /** @@ -1000,6 +1032,7 @@ bool __skb_flow_dissect(const struct net *net, }; __be16 n_proto = proto; struct bpf_prog *prog; + u32 result; if (skb) { ctx.skb = skb; @@ -1011,13 +1044,16 @@ bool __skb_flow_dissect(const struct net *net, } prog = READ_ONCE(run_array->items[0].prog); - ret = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, - hlen, flags); + result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, + hlen, flags); + if (result == BPF_FLOW_DISSECTOR_CONTINUE) + goto dissect_continue; __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); - return ret; + return result == BPF_OK; } +dissect_continue: rcu_read_unlock(); } @@ -1029,7 +1065,17 @@ bool __skb_flow_dissect(const struct net *net, key_eth_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS, target_container); - memcpy(key_eth_addrs, ð->h_dest, sizeof(*key_eth_addrs)); + memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs)); + } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) { + struct flow_dissector_key_num_of_vlans *key_num_of_vlans; + + key_num_of_vlans = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS, + target_container); + key_num_of_vlans->num_of_vlans = 0; } proto_again: @@ -1155,6 +1201,16 @@ proto_again: nhoff += sizeof(*vlan); } + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) && + !(key_control->flags & FLOW_DIS_ENCAPSULATION)) { + struct flow_dissector_key_num_of_vlans *key_nvs; + + key_nvs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_NUM_OF_VLANS, + target_container); + key_nvs->num_of_vlans++; + } + if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) { dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { @@ -1180,6 +1236,7 @@ proto_again: VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } key_vlan->vlan_tpid = saved_vlan_tpid; + key_vlan->vlan_eth_type = proto; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; @@ -1190,26 +1247,60 @@ proto_again: struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; + u16 ppp_proto; + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } - nhoff += PPPOE_SES_HLEN; - switch (hdr->proto) { - case htons(PPP_IP): + if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + /* least significant bit of the most significant octet + * indicates if protocol field was compressed + */ + ppp_proto = ntohs(hdr->proto); + if (ppp_proto & 0x0100) { + ppp_proto = ppp_proto >> 8; + nhoff += PPPOE_SES_HLEN - 1; + } else { + nhoff += PPPOE_SES_HLEN; + } + + if (ppp_proto == PPP_IP) { proto = htons(ETH_P_IP); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; - break; - case htons(PPP_IPV6): + } else if (ppp_proto == PPP_IPV6) { proto = htons(ETH_P_IPV6); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; - break; - default: + } else if (ppp_proto == PPP_MPLS_UC) { + proto = htons(ETH_P_MPLS_UC); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + } else if (ppp_proto == PPP_MPLS_MC) { + proto = htons(ETH_P_MPLS_MC); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + } else if (ppp_proto_is_valid(ppp_proto)) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + } else { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PPPOE)) { + struct flow_dissector_key_pppoe *key_pppoe; + + key_pppoe = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PPPOE, + target_container); + key_pppoe->session_id = hdr->hdr.sid; + key_pppoe->ppp_proto = htons(ppp_proto); + key_pppoe->type = htons(ETH_P_PPP_SES); + } break; } case htons(ETH_P_TIPC): { @@ -1280,6 +1371,23 @@ proto_again: break; } + case htons(ETH_P_PRP): + case htons(ETH_P_HSR): { + struct hsr_tag *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, + &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + proto = hdr->encap_proto; + nhoff += HSR_HLEN; + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; @@ -1417,6 +1525,10 @@ ip_proto_again: __skb_flow_dissect_icmp(skb, flow_dissector, target_container, data, nhoff, hlen); break; + case IPPROTO_L2TP: + __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; default: break; @@ -1460,7 +1572,7 @@ out_bad: } EXPORT_SYMBOL(__skb_flow_dissect); -static siphash_key_t hashrnd __read_mostly; +static siphash_aligned_key_t hashrnd; static __always_inline void __flow_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); @@ -1531,9 +1643,8 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys) switch (keys->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: - addr_diff = (__force u32)keys->addrs.v4addrs.dst - - (__force u32)keys->addrs.v4addrs.src; - if (addr_diff < 0) + if ((__force u32)keys->addrs.v4addrs.dst < + (__force u32)keys->addrs.v4addrs.src) swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); if ((__force u16)keys->ports.dst < diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 6beaea13564a..abe423fd5736 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/kernel.h> #include <linux/slab.h> +#include <net/act_api.h> #include <net/flow_offload.h> #include <linux/rtnetlink.h> #include <linux/mutex.h> @@ -27,6 +28,26 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) } EXPORT_SYMBOL(flow_rule_alloc); +struct flow_offload_action *offload_action_alloc(unsigned int num_actions) +{ + struct flow_offload_action *fl_action; + int i; + + fl_action = kzalloc(struct_size(fl_action, action.entries, num_actions), + GFP_KERNEL); + if (!fl_action) + return NULL; + + fl_action->action.num_entries = num_actions; + /* Pre-fill each action hw_stats with DONT_CARE. + * Caller can override this if it wants stats for a given action. + */ + for (i = 0; i < num_actions; i++) + fl_action->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; + + return fl_action; +} + #define FLOW_DISSECTOR_MATCH(__rule, __type, __out) \ const struct flow_match *__m = &(__rule)->match; \ struct flow_dissector *__d = (__m)->dissector; \ @@ -104,6 +125,13 @@ void flow_rule_match_ports(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_ports); +void flow_rule_match_ports_range(const struct flow_rule *rule, + struct flow_match_ports_range *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS_RANGE, out); +} +EXPORT_SYMBOL(flow_rule_match_ports_range); + void flow_rule_match_tcp(const struct flow_rule *rule, struct flow_match_tcp *out) { @@ -202,6 +230,20 @@ void flow_rule_match_ct(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_ct); +void flow_rule_match_pppoe(const struct flow_rule *rule, + struct flow_match_pppoe *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PPPOE, out); +} +EXPORT_SYMBOL(flow_rule_match_pppoe); + +void flow_rule_match_l2tpv3(const struct flow_rule *rule, + struct flow_match_l2tpv3 *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_L2TPV3, out); +} +EXPORT_SYMBOL(flow_rule_match_l2tpv3); + struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv)) @@ -397,6 +439,8 @@ int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) existing_qdiscs_register(cb, cb_priv); mutex_unlock(&flow_indr_block_lock); + tcf_action_reoffload_cb(cb, cb_priv, true); + return 0; } EXPORT_SYMBOL(flow_indr_dev_register); @@ -449,6 +493,7 @@ void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, __flow_block_indr_cleanup(release, cb_priv, &cleanup_list); mutex_unlock(&flow_indr_block_lock); + tcf_action_reoffload_cb(cb, cb_priv, false); flow_block_indr_notify(&cleanup_list); kfree(indr_dev); } @@ -549,19 +594,31 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, void (*cleanup)(struct flow_block_cb *block_cb)) { struct flow_indr_dev *this; + u32 count = 0; + int err; mutex_lock(&flow_indr_block_lock); + if (bo) { + if (bo->command == FLOW_BLOCK_BIND) + indir_dev_add(data, dev, sch, type, cleanup, bo); + else if (bo->command == FLOW_BLOCK_UNBIND) + indir_dev_remove(data); + } - if (bo->command == FLOW_BLOCK_BIND) - indir_dev_add(data, dev, sch, type, cleanup, bo); - else if (bo->command == FLOW_BLOCK_UNBIND) - indir_dev_remove(data); - - list_for_each_entry(this, &flow_block_indr_dev_list, list) - this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); + list_for_each_entry(this, &flow_block_indr_dev_list, list) { + err = this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); + if (!err) + count++; + } mutex_unlock(&flow_indr_block_lock); - return list_empty(&bo->cb_list) ? -EOPNOTSUPP : 0; + return (bo && list_empty(&bo->cb_list)) ? -EOPNOTSUPP : count; } EXPORT_SYMBOL(flow_indr_dev_setup_offload); + +bool flow_indr_dev_exists(void) +{ + return !list_empty(&flow_block_indr_dev_list); +} +EXPORT_SYMBOL(flow_indr_dev_exists); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index a10335b4ba2d..c8d137ef5980 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -345,7 +345,7 @@ static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats, for_each_possible_cpu(i) { const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); - qstats->qlen += qcpu->backlog; + qstats->qlen += qcpu->qlen; qstats->backlog += qcpu->backlog; qstats->drops += qcpu->drops; qstats->requeues += qcpu->requeues; diff --git a/net/core/gro.c b/net/core/gro.c new file mode 100644 index 000000000000..bc9451743307 --- /dev/null +++ b/net/core/gro.c @@ -0,0 +1,805 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <net/gro.h> +#include <net/dst_metadata.h> +#include <net/busy_poll.h> +#include <trace/events/net.h> + +#define MAX_GRO_SKBS 8 + +/* This should be increased if a protocol with a bigger head is added. */ +#define GRO_MAX_HEAD (MAX_HEADER + 128) + +static DEFINE_SPINLOCK(offload_lock); +static struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); +/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ +int gro_normal_batch __read_mostly = 8; + +/** + * dev_add_offload - register offload handlers + * @po: protocol offload declaration + * + * Add protocol offload handlers to the networking stack. The passed + * &proto_offload is linked into kernel lists and may not be freed until + * it has been removed from the kernel lists. + * + * This call does not sleep therefore it can not + * guarantee all CPU's that are in middle of receiving packets + * will see the new offload handlers (until the next received packet). + */ +void dev_add_offload(struct packet_offload *po) +{ + struct packet_offload *elem; + + spin_lock(&offload_lock); + list_for_each_entry(elem, &offload_base, list) { + if (po->priority < elem->priority) + break; + } + list_add_rcu(&po->list, elem->list.prev); + spin_unlock(&offload_lock); +} +EXPORT_SYMBOL(dev_add_offload); + +/** + * __dev_remove_offload - remove offload handler + * @po: packet offload declaration + * + * Remove a protocol offload handler that was previously added to the + * kernel offload handlers by dev_add_offload(). The passed &offload_type + * is removed from the kernel lists and can be freed or reused once this + * function returns. + * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPU's have gone + * through a quiescent state. + */ +static void __dev_remove_offload(struct packet_offload *po) +{ + struct list_head *head = &offload_base; + struct packet_offload *po1; + + spin_lock(&offload_lock); + + list_for_each_entry(po1, head, list) { + if (po == po1) { + list_del_rcu(&po->list); + goto out; + } + } + + pr_warn("dev_remove_offload: %p not found\n", po); +out: + spin_unlock(&offload_lock); +} + +/** + * dev_remove_offload - remove packet offload handler + * @po: packet offload declaration + * + * Remove a packet offload handler that was previously added to the kernel + * offload handlers by dev_add_offload(). The passed &offload_type is + * removed from the kernel lists and can be freed or reused once this + * function returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. + */ +void dev_remove_offload(struct packet_offload *po) +{ + __dev_remove_offload(po); + + synchronize_net(); +} +EXPORT_SYMBOL(dev_remove_offload); + +/** + * skb_eth_gso_segment - segmentation handler for ethernet protocols. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * @type: Ethernet Protocol ID + */ +struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, + netdev_features_t features, __be16 type) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + return segs; +} +EXPORT_SYMBOL(skb_eth_gso_segment); + +/** + * skb_mac_gso_segment - mac layer segmentation handler. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + int vlan_depth = skb->mac_len; + __be16 type = skb_network_protocol(skb, &vlan_depth); + + if (unlikely(!type)) + return ERR_PTR(-EINVAL); + + __skb_pull(skb, vlan_depth); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + __skb_push(skb, skb->data - skb_mac_header(skb)); + + return segs; +} +EXPORT_SYMBOL(skb_mac_gso_segment); + +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) +{ + struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); + unsigned int offset = skb_gro_offset(skb); + unsigned int headlen = skb_headlen(skb); + unsigned int len = skb_gro_len(skb); + unsigned int delta_truesize; + unsigned int gro_max_size; + unsigned int new_truesize; + struct sk_buff *lp; + int segs; + + /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ + gro_max_size = READ_ONCE(p->dev->gro_max_size); + + if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) + return -E2BIG; + + if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { + if (p->protocol != htons(ETH_P_IPV6) || + skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || + ipv6_hdr(p)->nexthdr != IPPROTO_TCP || + p->encapsulation) + return -E2BIG; + } + + segs = NAPI_GRO_CB(skb)->count; + lp = NAPI_GRO_CB(p)->last; + pinfo = skb_shinfo(lp); + + if (headlen <= offset) { + skb_frag_t *frag; + skb_frag_t *frag2; + int i = skbinfo->nr_frags; + int nr_frags = pinfo->nr_frags + i; + + if (nr_frags > MAX_SKB_FRAGS) + goto merge; + + offset -= headlen; + pinfo->nr_frags = nr_frags; + skbinfo->nr_frags = 0; + + frag = pinfo->frags + nr_frags; + frag2 = skbinfo->frags + i; + do { + *--frag = *--frag2; + } while (--i); + + skb_frag_off_add(frag, offset); + skb_frag_size_sub(frag, offset); + + /* all fragments truesize : remove (head size + sk_buff) */ + new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); + delta_truesize = skb->truesize - new_truesize; + + skb->truesize = new_truesize; + skb->len -= skb->data_len; + skb->data_len = 0; + + NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; + goto done; + } else if (skb->head_frag) { + int nr_frags = pinfo->nr_frags; + skb_frag_t *frag = pinfo->frags + nr_frags; + struct page *page = virt_to_head_page(skb->head); + unsigned int first_size = headlen - offset; + unsigned int first_offset; + + if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) + goto merge; + + first_offset = skb->data - + (unsigned char *)page_address(page) + + offset; + + pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; + + __skb_frag_set_page(frag, page); + skb_frag_off_set(frag, first_offset); + skb_frag_size_set(frag, first_size); + + memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); + /* We dont need to clear skbinfo->nr_frags here */ + + new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); + delta_truesize = skb->truesize - new_truesize; + skb->truesize = new_truesize; + NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; + goto done; + } + +merge: + /* sk owenrship - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; + delta_truesize = skb->truesize; + if (offset > headlen) { + unsigned int eat = offset - headlen; + + skb_frag_off_add(&skbinfo->frags[0], eat); + skb_frag_size_sub(&skbinfo->frags[0], eat); + skb->data_len -= eat; + skb->len -= eat; + offset = headlen; + } + + __skb_pull(skb, offset); + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + NAPI_GRO_CB(p)->last = skb; + __skb_header_release(skb); + lp = p; + +done: + NAPI_GRO_CB(p)->count += segs; + p->data_len += len; + p->truesize += delta_truesize; + p->len += len; + if (lp != p) { + lp->data_len += len; + lp->truesize += delta_truesize; + lp->len += len; + } + NAPI_GRO_CB(skb)->same_flow = 1; + return 0; +} + + +static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) +{ + struct packet_offload *ptype; + __be16 type = skb->protocol; + struct list_head *head = &offload_base; + int err = -ENOENT; + + BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); + + if (NAPI_GRO_CB(skb)->count == 1) { + skb_shinfo(skb)->gso_size = 0; + goto out; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + + err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, + ipv6_gro_complete, inet_gro_complete, + skb, 0); + break; + } + rcu_read_unlock(); + + if (err) { + WARN_ON(&ptype->list == head); + kfree_skb(skb); + return; + } + +out: + gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); +} + +static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, + bool flush_old) +{ + struct list_head *head = &napi->gro_hash[index].list; + struct sk_buff *skb, *p; + + list_for_each_entry_safe_reverse(skb, p, head, list) { + if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) + return; + skb_list_del_init(skb); + napi_gro_complete(napi, skb); + napi->gro_hash[index].count--; + } + + if (!napi->gro_hash[index].count) + __clear_bit(index, &napi->gro_bitmask); +} + +/* napi->gro_hash[].list contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. + */ +void napi_gro_flush(struct napi_struct *napi, bool flush_old) +{ + unsigned long bitmask = napi->gro_bitmask; + unsigned int i, base = ~0U; + + while ((i = ffs(bitmask)) != 0) { + bitmask >>= i; + base += i; + __napi_gro_flush_chain(napi, base, flush_old); + } +} +EXPORT_SYMBOL(napi_gro_flush); + +static void gro_list_prepare(const struct list_head *head, + const struct sk_buff *skb) +{ + unsigned int maclen = skb->dev->hard_header_len; + u32 hash = skb_get_hash_raw(skb); + struct sk_buff *p; + + list_for_each_entry(p, head, list) { + unsigned long diffs; + + NAPI_GRO_CB(p)->flush = 0; + + if (hash != skb_get_hash_raw(p)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); + if (skb_vlan_tag_present(p)) + diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); + diffs |= skb_metadata_differs(p, skb); + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_mac_header(skb), + maclen); + + /* in most common scenarions 'slow_gro' is 0 + * otherwise we are already on some slower paths + * either skip all the infrequent tests altogether or + * avoid trying too hard to skip each of them individually + */ + if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { +#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + struct tc_skb_ext *skb_ext; + struct tc_skb_ext *p_ext; +#endif + + diffs |= p->sk != skb->sk; + diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); + +#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + skb_ext = skb_ext_find(skb, TC_SKB_EXT); + p_ext = skb_ext_find(p, TC_SKB_EXT); + + diffs |= (!!p_ext) ^ (!!skb_ext); + if (!diffs && unlikely(skb_ext)) + diffs |= p_ext->chain ^ skb_ext->chain; +#endif + } + + NAPI_GRO_CB(p)->same_flow = !diffs; + } +} + +static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) +{ + const struct skb_shared_info *pinfo = skb_shinfo(skb); + const skb_frag_t *frag0 = &pinfo->frags[0]; + + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; + + if (!skb_headlen(skb) && pinfo->nr_frags && + !PageHighMem(skb_frag_page(frag0)) && + (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, + skb_frag_size(frag0), + skb->end - skb->tail); + } +} + +static void gro_pull_from_frag0(struct sk_buff *skb, int grow) +{ + struct skb_shared_info *pinfo = skb_shinfo(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->data_len -= grow; + skb->tail += grow; + + skb_frag_off_add(&pinfo->frags[0], grow); + skb_frag_size_sub(&pinfo->frags[0], grow); + + if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { + skb_frag_unref(skb, 0); + memmove(pinfo->frags, pinfo->frags + 1, + --pinfo->nr_frags * sizeof(pinfo->frags[0])); + } +} + +static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) +{ + struct sk_buff *oldest; + + oldest = list_last_entry(head, struct sk_buff, list); + + /* We are called with head length >= MAX_GRO_SKBS, so this is + * impossible. + */ + if (WARN_ON_ONCE(!oldest)) + return; + + /* Do not adjust napi->gro_hash[].count, caller is adding a new + * SKB to the chain. + */ + skb_list_del_init(oldest); + napi_gro_complete(napi, oldest); +} + +static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct gro_list *gro_list = &napi->gro_hash[bucket]; + struct list_head *head = &offload_base; + struct packet_offload *ptype; + __be16 type = skb->protocol; + struct sk_buff *pp = NULL; + enum gro_result ret; + int same_flow; + int grow; + + if (netif_elide_gro(skb->dev)) + goto normal; + + gro_list_prepare(&gro_list->list, skb); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + + skb_set_network_header(skb, skb_gro_offset(skb)); + skb_reset_mac_len(skb); + BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32)); + BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), + sizeof(u32))); /* Avoid slow unaligned acc */ + *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); + NAPI_GRO_CB(skb)->is_atomic = 1; + NAPI_GRO_CB(skb)->count = 1; + if (unlikely(skb_is_gso(skb))) { + NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; + /* Only support TCP at the moment. */ + if (!skb_is_gso_tcp(skb)) + NAPI_GRO_CB(skb)->flush = 1; + } + + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; + break; + } + + pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + &gro_list->list, skb); + break; + } + rcu_read_unlock(); + + if (&ptype->list == head) + goto normal; + + if (PTR_ERR(pp) == -EINPROGRESS) { + ret = GRO_CONSUMED; + goto ok; + } + + same_flow = NAPI_GRO_CB(skb)->same_flow; + ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; + + if (pp) { + skb_list_del_init(pp); + napi_gro_complete(napi, pp); + gro_list->count--; + } + + if (same_flow) + goto ok; + + if (NAPI_GRO_CB(skb)->flush) + goto normal; + + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) + gro_flush_oldest(napi, &gro_list->list); + else + gro_list->count++; + + NAPI_GRO_CB(skb)->age = jiffies; + NAPI_GRO_CB(skb)->last = skb; + if (!skb_is_gso(skb)) + skb_shinfo(skb)->gso_size = skb_gro_len(skb); + list_add(&skb->list, &gro_list->list); + ret = GRO_HELD; + +pull: + grow = skb_gro_offset(skb) - skb_headlen(skb); + if (grow > 0) + gro_pull_from_frag0(skb, grow); +ok: + if (gro_list->count) { + if (!test_bit(bucket, &napi->gro_bitmask)) + __set_bit(bucket, &napi->gro_bitmask); + } else if (test_bit(bucket, &napi->gro_bitmask)) { + __clear_bit(bucket, &napi->gro_bitmask); + } + + return ret; + +normal: + ret = GRO_NORMAL; + goto pull; +} + +struct packet_offload *gro_find_receive_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_receive_by_type); + +struct packet_offload *gro_find_complete_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_complete_by_type); + +static gro_result_t napi_skb_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) +{ + switch (ret) { + case GRO_NORMAL: + gro_normal_one(napi, skb, 1); + break; + + case GRO_MERGED_FREE: + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); + else + __kfree_skb_defer(skb); + break; + + case GRO_HELD: + case GRO_MERGED: + case GRO_CONSUMED: + break; + } + + return ret; +} + +gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + gro_result_t ret; + + skb_mark_napi_id(skb, napi); + trace_napi_gro_receive_entry(skb); + + skb_gro_reset_offset(skb, 0); + + ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_receive_exit(ret); + + return ret; +} +EXPORT_SYMBOL(napi_gro_receive); + +static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +{ + if (unlikely(skb->pfmemalloc)) { + consume_skb(skb); + return; + } + __skb_pull(skb, skb_headlen(skb)); + /* restore the reserve we had after netdev_alloc_skb_ip_align() */ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); + __vlan_hwaccel_clear_tag(skb); + skb->dev = napi->dev; + skb->skb_iif = 0; + + /* eth_type_trans() assumes pkt_type is PACKET_HOST */ + skb->pkt_type = PACKET_HOST; + + skb->encapsulation = 0; + skb_shinfo(skb)->gso_type = 0; + skb_shinfo(skb)->gso_size = 0; + if (unlikely(skb->slow_gro)) { + skb_orphan(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); + skb->slow_gro = 0; + } + + napi->skb = skb; +} + +struct sk_buff *napi_get_frags(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + + if (!skb) { + skb = napi_alloc_skb(napi, GRO_MAX_HEAD); + if (skb) { + napi->skb = skb; + skb_mark_napi_id(skb, napi); + } + } + return skb; +} +EXPORT_SYMBOL(napi_get_frags); + +static gro_result_t napi_frags_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) +{ + switch (ret) { + case GRO_NORMAL: + case GRO_HELD: + __skb_push(skb, ETH_HLEN); + skb->protocol = eth_type_trans(skb, skb->dev); + if (ret == GRO_NORMAL) + gro_normal_one(napi, skb, 1); + break; + + case GRO_MERGED_FREE: + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else + napi_reuse_skb(napi, skb); + break; + + case GRO_MERGED: + case GRO_CONSUMED: + break; + } + + return ret; +} + +/* Upper GRO stack assumes network header starts at gro_offset=0 + * Drivers could call both napi_gro_frags() and napi_gro_receive() + * We copy ethernet header into skb->data to have a common layout. + */ +static struct sk_buff *napi_frags_skb(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + const struct ethhdr *eth; + unsigned int hlen = sizeof(*eth); + + napi->skb = NULL; + + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb, hlen); + + if (unlikely(skb_gro_header_hard(skb, hlen))) { + eth = skb_gro_header_slow(skb, hlen, 0); + if (unlikely(!eth)) { + net_warn_ratelimited("%s: dropping impossible skb from %s\n", + __func__, napi->dev->name); + napi_reuse_skb(napi, skb); + return NULL; + } + } else { + eth = (const struct ethhdr *)skb->data; + gro_pull_from_frag0(skb, hlen); + NAPI_GRO_CB(skb)->frag0 += hlen; + NAPI_GRO_CB(skb)->frag0_len -= hlen; + } + __skb_pull(skb, hlen); + + /* + * This works because the only protocols we care about don't require + * special handling. + * We'll fix it up properly in napi_frags_finish() + */ + skb->protocol = eth->h_proto; + + return skb; +} + +gro_result_t napi_gro_frags(struct napi_struct *napi) +{ + gro_result_t ret; + struct sk_buff *skb = napi_frags_skb(napi); + + trace_napi_gro_frags_entry(skb); + + ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_frags_exit(ret); + + return ret; +} +EXPORT_SYMBOL(napi_gro_frags); + +/* Compute the checksum from gro_offset and return the folded value + * after adding in any pseudo checksum. + */ +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) +{ + __wsum wsum; + __sum16 sum; + + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); + + /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ + sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + /* See comments in __skb_checksum_complete(). */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + + NAPI_GRO_CB(skb)->csum = wsum; + NAPI_GRO_CB(skb)->csum_valid = 1; + + return sum; +} +EXPORT_SYMBOL(__skb_gro_checksum_complete); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index 6eb2e5ec2c50..ed5ec5de47f6 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -26,9 +26,9 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) cell = this_cpu_ptr(gcells->cells); - if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) { + if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) { drop: - atomic_long_inc(&dev->rx_dropped); + dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); res = NET_RX_DROP; goto unlock; @@ -81,16 +81,30 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); - netif_napi_add(dev, &cell->napi, gro_cell_poll, - NAPI_POLL_WEIGHT); + netif_napi_add(dev, &cell->napi, gro_cell_poll); napi_enable(&cell->napi); } return 0; } EXPORT_SYMBOL(gro_cells_init); +struct percpu_free_defer { + struct rcu_head rcu; + void __percpu *ptr; +}; + +static void percpu_free_defer_callback(struct rcu_head *head) +{ + struct percpu_free_defer *defer; + + defer = container_of(head, struct percpu_free_defer, rcu); + free_percpu(defer->ptr); + kfree(defer); +} + void gro_cells_destroy(struct gro_cells *gcells) { + struct percpu_free_defer *defer; int i; if (!gcells->cells) @@ -102,12 +116,23 @@ void gro_cells_destroy(struct gro_cells *gcells) __netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } - /* This barrier is needed because netpoll could access dev->napi_list - * under rcu protection. + /* We need to observe an rcu grace period before freeing ->cells, + * because netpoll could access dev->napi_list under rcu protection. + * Try hard using call_rcu() instead of synchronize_rcu(), + * because we might be called from cleanup_net(), and we + * definitely do not want to block this critical task. */ - synchronize_net(); - - free_percpu(gcells->cells); + defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN); + if (likely(defer)) { + defer->ptr = gcells->cells; + call_rcu(&defer->rcu, percpu_free_defer_callback); + } else { + /* We do not hold RTNL at this point, synchronize_net() + * would not be able to expedite this sync. + */ + synchronize_rcu_expedited(); + free_percpu(gcells->cells); + } gcells->cells = NULL; } EXPORT_SYMBOL(gro_cells_destroy); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 1a455847da54..aa6cb1f90966 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -18,6 +18,7 @@ #include <linux/bitops.h> #include <linux/types.h> +#include "dev.h" enum lw_bits { LW_URGENT = 0, @@ -55,7 +56,7 @@ static void rfc2863_policy(struct net_device *dev) if (operstate == dev->operstate) return; - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); switch(dev->link_mode) { case IF_LINK_MODE_TESTING: @@ -74,7 +75,7 @@ static void rfc2863_policy(struct net_device *dev) dev->operstate = operstate; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); } @@ -109,7 +110,7 @@ static void linkwatch_add_event(struct net_device *dev) spin_lock_irqsave(&lweventlist_lock, flags); if (list_empty(&dev->link_watch_list)) { list_add_tail(&dev->link_watch_list, &lweventlist); - dev_hold(dev); + netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC); } spin_unlock_irqrestore(&lweventlist_lock, flags); } @@ -166,7 +167,10 @@ static void linkwatch_do_dev(struct net_device *dev) netdev_state_change(dev); } - dev_put(dev); + /* Note: our callers are responsible for calling netdev_tracker_free(). + * This is the reason we use __dev_put() instead of dev_put(). + */ + __dev_put(dev); } static void __linkwatch_run_queue(int urgent_only) @@ -209,6 +213,10 @@ static void __linkwatch_run_queue(int urgent_only) list_add_tail(&dev->link_watch_list, &lweventlist); continue; } + /* We must free netdev tracker under + * the spinlock protection. + */ + netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); spin_unlock_irq(&lweventlist_lock); linkwatch_do_dev(dev); do_dev--; @@ -232,6 +240,10 @@ void linkwatch_forget_dev(struct net_device *dev) if (!list_empty(&dev->link_watch_list)) { list_del_init(&dev->link_watch_list); clean = 1; + /* We must release netdev tracker under + * the spinlock protection. + */ + netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); } spin_unlock_irqrestore(&lweventlist_lock, flags); if (clean) diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 2f7940bcf715..8b6b5e72b217 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -2,6 +2,7 @@ /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> */ +#include <linux/filter.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> @@ -158,10 +159,8 @@ static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) return dst->lwtstate->orig_output(net, sk, skb); } -static int xmit_check_hhlen(struct sk_buff *skb) +static int xmit_check_hhlen(struct sk_buff *skb, int hh_len) { - int hh_len = skb_dst(skb)->dev->hard_header_len; - if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); @@ -273,6 +272,7 @@ static int bpf_xmit(struct sk_buff *skb) bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->xmit.prog) { + int hh_len = dst->dev->hard_header_len; __be16 proto = skb->protocol; int ret; @@ -290,7 +290,7 @@ static int bpf_xmit(struct sk_buff *skb) /* If the header was expanded, headroom might be too * small for L2 header to come, expand as needed. */ - ret = xmit_check_hhlen(skb); + ret = xmit_check_hhlen(skb, hh_len); if (unlikely(ret)) return ret; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 2820aca2173a..6fac2f0ef074 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -50,6 +50,7 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "IOAM6"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: + case LWTUNNEL_ENCAP_XFRM: case LWTUNNEL_ENCAP_NONE: case __LWTUNNEL_ENCAP_MAX: /* should not have got here */ @@ -197,6 +198,10 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla_entype) { + if (nla_len(nla_entype) < sizeof(u16)) { + NL_SET_ERR_MSG(extack, "Invalid RTA_ENCAP_TYPE"); + return -EINVAL; + } encap_type = nla_get_u16(nla_entype); if (lwtunnel_valid_encap_type(encap_type, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index dda12fbd177b..a77a85e357e0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -111,7 +111,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) unsigned long neigh_rand_reach_time(unsigned long base) { - return base ? (prandom_u32() % base) + (base >> 1) : 0; + return base ? prandom_u32_max(base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); @@ -307,11 +307,35 @@ static int neigh_del_timer(struct neighbour *n) return 0; } -static void pneigh_queue_purge(struct sk_buff_head *list) +static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) { + struct sk_buff_head tmp; + unsigned long flags; struct sk_buff *skb; - while ((skb = skb_dequeue(list)) != NULL) { + skb_queue_head_init(&tmp); + spin_lock_irqsave(&list->lock, flags); + skb = skb_peek(list); + while (skb != NULL) { + struct sk_buff *skb_next = skb_peek_next(skb, list); + struct net_device *dev = skb->dev; + + if (net == NULL || net_eq(dev_net(dev), net)) { + struct in_device *in_dev; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); + if (in_dev) + in_dev->arp_parms->qlen--; + rcu_read_unlock(); + __skb_unlink(skb, list); + __skb_queue_tail(&tmp, skb); + } + skb = skb_next; + } + spin_unlock_irqrestore(&list->lock, flags); + + while ((skb = __skb_dequeue(&tmp))) { dev_put(skb->dev); kfree_skb(skb); } @@ -385,9 +409,9 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev); - - del_timer_sync(&tbl->proxy_timer); - pneigh_queue_purge(&tbl->proxy_queue); + pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL); + if (skb_queue_empty_lockless(&tbl->proxy_queue)) + del_timer_sync(&tbl->proxy_timer); return 0; } @@ -624,7 +648,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, memcpy(n->primary_key, pkey, key_len); n->dev = dev; - dev_hold(dev); + netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC); /* Protocol specific setup. */ if (tbl->constructor && (error = tbl->constructor(n)) < 0) { @@ -770,10 +794,10 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; - dev_hold(dev); + netdev_hold(dev, &n->dev_tracker, GFP_KERNEL); if (tbl->pconstructor && tbl->pconstructor(n)) { - dev_put(dev); + netdev_put(dev, &n->dev_tracker); kfree(n); n = NULL; goto out; @@ -805,7 +829,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); - dev_put(n->dev); + netdev_put(n->dev, &n->dev_tracker); kfree(n); return 0; } @@ -838,7 +862,7 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, n->next = NULL; if (tbl->pdestructor) tbl->pdestructor(n); - dev_put(n->dev); + netdev_put(n->dev, &n->dev_tracker); kfree(n); } return -ENOENT; @@ -879,7 +903,7 @@ void neigh_destroy(struct neighbour *neigh) if (dev->netdev_ops->ndo_neigh_destroy) dev->netdev_ops->ndo_neigh_destroy(dev, neigh); - dev_put(dev); + netdev_put(dev, &neigh->dev_tracker); neigh_parms_put(neigh->parms); neigh_dbg(2, "neigh %p is destroyed\n", neigh); @@ -1133,7 +1157,8 @@ out: neigh_release(neigh); } -int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, + const bool immediate_ok) { int rc; bool immediate_probe = false; @@ -1154,18 +1179,23 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); neigh_del_timer(neigh); - neigh->nud_state = NUD_INCOMPLETE; + neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; - next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), - HZ/100); + if (!immediate_ok) { + next = now + 1; + } else { + immediate_probe = true; + next = now + max(NEIGH_VAR(neigh->parms, + RETRANS_TIME), + HZ / 100); + } neigh_add_timer(neigh, next); - immediate_probe = true; } else { neigh->nud_state = NUD_FAILED; neigh->updated = jiffies; write_unlock_bh(&neigh->lock); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); return 1; } } else if (neigh->nud_state & NUD_STALE) { @@ -1187,7 +1217,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) if (!buff) break; neigh->arp_queue_len_bytes -= buff->truesize; - kfree_skb(buff); + kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); @@ -1209,7 +1239,7 @@ out_dead: if (neigh->nud_state & NUD_STALE) goto out_unlock_bh; write_unlock_bh(&neigh->lock); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD); trace_neigh_event_send_dead(neigh, 1); return 1; } @@ -1571,9 +1601,9 @@ static void neigh_managed_work(struct work_struct *work) write_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) - neigh_event_send(neigh, NULL); + neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, - NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME)); + NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); write_unlock_bh(&tbl->lock); } @@ -1591,8 +1621,15 @@ static void neigh_proxy_process(struct timer_list *t) if (tdif <= 0) { struct net_device *dev = skb->dev; + struct in_device *in_dev; + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); + if (in_dev) + in_dev->arp_parms->qlen--; + rcu_read_unlock(); __skb_unlink(skb, &tbl->proxy_queue); + if (tbl->proxy_redo && netif_running(dev)) { rcu_read_lock(); tbl->proxy_redo(skb); @@ -1617,7 +1654,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, unsigned long sched_next = jiffies + prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY)); - if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) { + if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); return; } @@ -1633,6 +1670,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, skb_dst_drop(skb); dev_hold(skb->dev); __skb_queue_tail(&tbl->proxy_queue, skb); + p->qlen++; mod_timer(&tbl->proxy_timer, sched_next); spin_unlock(&tbl->proxy_queue.lock); } @@ -1665,13 +1703,14 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, refcount_set(&p->refcnt, 1); p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); - dev_hold(dev); + p->qlen = 0; + netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; write_pnet(&p->net, net); p->sysctl_table = NULL; if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { - dev_put(dev); + netdev_put(dev, &p->dev_tracker); kfree(p); return NULL; } @@ -1702,7 +1741,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) list_del(&parms->list); parms->dead = 1; write_unlock_bh(&tbl->lock); - dev_put(parms->dev); + netdev_put(parms->dev, &parms->dev_tracker); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); @@ -1730,6 +1769,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) refcount_set(&tbl->parms.refcnt, 1); tbl->parms.reachable_time = neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); + tbl->parms.qlen = 0; tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) @@ -1781,7 +1821,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl) cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); - pneigh_queue_purge(&tbl->proxy_queue); + pneigh_queue_purge(&tbl->proxy_queue, NULL); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); @@ -1813,9 +1853,6 @@ static struct neigh_table *neigh_find_table(int family) case AF_INET6: tbl = neigh_tables[NEIGH_ND_TABLE]; break; - case AF_DECnet: - tbl = neigh_tables[NEIGH_DN_TABLE]; - break; } return tbl; @@ -2094,7 +2131,9 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) nla_put_msecs(skb, NDTPA_PROXY_DELAY, NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_LOCKTIME, - NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD)) + NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) || + nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS, + NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); @@ -2249,6 +2288,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, [NDTPA_LOCKTIME] = { .type = NLA_U64 }, + [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 }, }; static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2367,6 +2407,10 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, nla_get_msecs(tbp[i])); call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); break; + case NDTPA_INTERVAL_PROBE_TIME_MS: + NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS, + nla_get_msecs(tbp[i])); + break; case NDTPA_RETRANS_TIME: NEIGH_VAR_SET(p, RETRANS_TIME, nla_get_msecs(tbp[i])); @@ -3364,7 +3408,7 @@ EXPORT_SYMBOL(neigh_seq_stop); static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; if (*pos == 0) @@ -3381,7 +3425,7 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { @@ -3401,7 +3445,7 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v) static int neigh_stat_seq_show(struct seq_file *seq, void *v) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { @@ -3556,6 +3600,22 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, return ret; } +static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp = *ctl; + int ret; + + int min = msecs_to_jiffies(1); + + tmp.extra1 = &min; + tmp.extra2 = NULL; + + ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos); + neigh_proc_update(ctl, write); + return ret; +} + int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3652,6 +3712,9 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, #define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies) +#define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \ + NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive) + #define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \ NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies) @@ -3670,6 +3733,8 @@ static struct neigh_sysctl_table { NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"), NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"), + NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS, + "interval_probe_time_ms"), NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"), NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"), @@ -3722,7 +3787,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; - t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL); + t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto err; @@ -3770,10 +3835,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, neigh_proc_base_reachable_time; } - /* Don't export sysctls to unprivileged users */ - if (neigh_parms_net(p)->user_ns != &init_user_ns) - t->neigh_vars[0].procname = NULL; - switch (neigh_parms_family(p)) { case AF_INET: p_name = "ipv4"; diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index d8b9dbabd4a4..1ec23bf8b05c 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -4,6 +4,8 @@ #include <linux/seq_file.h> #include <net/wext.h> +#include "dev.h" + #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) #define get_bucket(x) ((x) >> BUCKET_SPACE) @@ -190,12 +192,23 @@ static const struct seq_operations softnet_seq_ops = { .show = softnet_seq_show, }; -static void *ptype_get_idx(loff_t pos) +static void *ptype_get_idx(struct seq_file *seq, loff_t pos) { + struct list_head *ptype_list = NULL; struct packet_type *pt = NULL; + struct net_device *dev; loff_t i = 0; int t; + for_each_netdev_rcu(seq_file_net(seq), dev) { + ptype_list = &dev->ptype_all; + list_for_each_entry_rcu(pt, ptype_list, list) { + if (i == pos) + return pt; + ++i; + } + } + list_for_each_entry_rcu(pt, &ptype_all, list) { if (i == pos) return pt; @@ -216,22 +229,40 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); - return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; + return *pos ? ptype_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net_device *dev; struct packet_type *pt; struct list_head *nxt; int hash; ++*pos; if (v == SEQ_START_TOKEN) - return ptype_get_idx(0); + return ptype_get_idx(seq, 0); pt = v; nxt = pt->list.next; + if (pt->dev) { + if (nxt != &pt->dev->ptype_all) + goto found; + + dev = pt->dev; + for_each_netdev_continue_rcu(seq_file_net(seq), dev) { + if (!list_empty(&dev->ptype_all)) { + nxt = dev->ptype_all.next; + goto found; + } + } + + nxt = ptype_all.next; + goto ptype_all; + } + if (pt->type == htons(ETH_P_ALL)) { +ptype_all: if (nxt != &ptype_all) goto found; hash = 0; @@ -260,7 +291,8 @@ static int ptype_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, "Type Device Function\n"); - else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { + else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) && + (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) { if (pt->type == htons(ETH_P_ALL)) seq_puts(seq, "ALL "); else diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 9c01c642cf9e..8409d41405df 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -24,6 +24,7 @@ #include <linux/of_net.h> #include <linux/cpu.h> +#include "dev.h" #include "net-sysfs.h" #ifdef CONFIG_SYSFS @@ -32,6 +33,7 @@ static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; +/* Caller holds RTNL or dev_base_lock */ static inline int dev_isalive(const struct net_device *dev) { return dev->reg_state <= NETREG_REGISTERED; @@ -57,7 +59,7 @@ static ssize_t netdev_show(const struct device *dev, #define NETDEVICE_SHOW(field, format_string) \ static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ - return sprintf(buf, format_string, dev->field); \ + return sysfs_emit(buf, format_string, dev->field); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -116,13 +118,13 @@ static ssize_t iflink_show(struct device *dev, struct device_attribute *attr, { struct net_device *ndev = to_net_dev(dev); - return sprintf(buf, fmt_dec, dev_get_iflink(ndev)); + return sysfs_emit(buf, fmt_dec, dev_get_iflink(ndev)); } static DEVICE_ATTR_RO(iflink); static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { - return sprintf(buf, fmt_dec, dev->name_assign_type); + return sysfs_emit(buf, fmt_dec, dev->name_assign_type); } static ssize_t name_assign_type_show(struct device *dev, @@ -192,7 +194,7 @@ static ssize_t carrier_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) - return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev)); + return sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); return -EINVAL; } @@ -213,11 +215,11 @@ static ssize_t speed_show(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev)) { + if (netif_running(netdev) && netif_device_present(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) - ret = sprintf(buf, fmt_dec, cmd.base.speed); + ret = sysfs_emit(buf, fmt_dec, cmd.base.speed); } rtnl_unlock(); return ret; @@ -256,7 +258,7 @@ static ssize_t duplex_show(struct device *dev, duplex = "unknown"; break; } - ret = sprintf(buf, "%s\n", duplex); + ret = sysfs_emit(buf, "%s\n", duplex); } } rtnl_unlock(); @@ -270,7 +272,7 @@ static ssize_t testing_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) - return sprintf(buf, fmt_dec, !!netif_testing(netdev)); + return sysfs_emit(buf, fmt_dec, !!netif_testing(netdev)); return -EINVAL; } @@ -282,7 +284,7 @@ static ssize_t dormant_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) - return sprintf(buf, fmt_dec, !!netif_dormant(netdev)); + return sysfs_emit(buf, fmt_dec, !!netif_dormant(netdev)); return -EINVAL; } @@ -313,7 +315,7 @@ static ssize_t operstate_show(struct device *dev, if (operstate >= ARRAY_SIZE(operstates)) return -EINVAL; /* should not happen */ - return sprintf(buf, "%s\n", operstates[operstate]); + return sysfs_emit(buf, "%s\n", operstates[operstate]); } static DEVICE_ATTR_RO(operstate); @@ -323,9 +325,9 @@ static ssize_t carrier_changes_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); - return sprintf(buf, fmt_dec, - atomic_read(&netdev->carrier_up_count) + - atomic_read(&netdev->carrier_down_count)); + return sysfs_emit(buf, fmt_dec, + atomic_read(&netdev->carrier_up_count) + + atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_changes); @@ -335,7 +337,7 @@ static ssize_t carrier_up_count_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); - return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); + return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); } static DEVICE_ATTR_RO(carrier_up_count); @@ -345,7 +347,7 @@ static ssize_t carrier_down_count_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); - return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); + return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_down_count); @@ -460,7 +462,7 @@ static ssize_t ifalias_show(struct device *dev, ret = dev_get_alias(netdev, tmp, sizeof(tmp)); if (ret > 0) - ret = sprintf(buf, "%s\n", tmp); + ret = sysfs_emit(buf, "%s\n", tmp); return ret; } static DEVICE_ATTR_RW(ifalias); @@ -488,14 +490,6 @@ static ssize_t proto_down_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - struct net_device *netdev = to_net_dev(dev); - - /* The check is also done in change_proto_down; this helps returning - * early without hitting the trylock/restart in netdev_store. - */ - if (!netdev->netdev_ops->ndo_change_proto_down) - return -EOPNOTSUPP; - return netdev_store(dev, attr, buf, len, change_proto_down); } NETDEVICE_SHOW_RW(proto_down, fmt_dec); @@ -520,7 +514,7 @@ static ssize_t phys_port_id_show(struct device *dev, ret = dev_get_phys_port_id(netdev, &ppid); if (!ret) - ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); } rtnl_unlock(); @@ -549,7 +543,7 @@ static ssize_t phys_port_name_show(struct device *dev, ret = dev_get_phys_port_name(netdev, name, sizeof(name)); if (!ret) - ret = sprintf(buf, "%s\n", name); + ret = sysfs_emit(buf, "%s\n", name); } rtnl_unlock(); @@ -579,7 +573,7 @@ static ssize_t phys_switch_id_show(struct device *dev, ret = dev_get_port_parent_id(netdev, &ppid, false); if (!ret) - ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); } rtnl_unlock(); @@ -597,7 +591,7 @@ static ssize_t threaded_show(struct device *dev, return restart_syscall(); if (dev_isalive(netdev)) - ret = sprintf(buf, fmt_dec, netdev->threaded); + ret = sysfs_emit(buf, fmt_dec, netdev->threaded); rtnl_unlock(); return ret; @@ -679,7 +673,7 @@ static ssize_t netstat_show(const struct device *d, struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); - ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); + ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } read_unlock(&dev_base_lock); return ret; @@ -753,7 +747,6 @@ static const struct attribute_group netstat_group = { .attrs = netstat_attrs, }; -#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) static struct attribute *wireless_attrs[] = { NULL }; @@ -762,7 +755,19 @@ static const struct attribute_group wireless_group = { .name = "wireless", .attrs = wireless_attrs, }; + +static bool wireless_group_needed(struct net_device *ndev) +{ +#if IS_ENABLED(CONFIG_CFG80211) + if (ndev->ieee80211_ptr) + return true; +#endif +#if IS_ENABLED(CONFIG_WIRELESS_EXT) + if (ndev->wireless_handlers) + return true; #endif + return false; +} #else /* CONFIG_SYSFS */ #define net_class_groups NULL @@ -819,7 +824,7 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) for (i = 0; i < map->len; i++) cpumask_set_cpu(map->cpus[i], mask); - len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); + len = sysfs_emit(buf, "%*pb\n", cpumask_pr_args(mask)); rcu_read_unlock(); free_cpumask_var(mask); @@ -831,7 +836,7 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, { struct rps_map *old_map, *map; cpumask_var_t mask; - int err, cpu, i, hk_flags; + int err, cpu, i; static DEFINE_MUTEX(rps_map_mutex); if (!capable(CAP_NET_ADMIN)) @@ -847,8 +852,8 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, } if (!cpumask_empty(mask)) { - hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; - cpumask_and(mask, mask, housekeeping_cpumask(hk_flags)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); if (cpumask_empty(mask)) { free_cpumask_var(mask); return -EINVAL; @@ -905,7 +910,7 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, val = (unsigned long)flow_table->mask + 1; rcu_read_unlock(); - return sprintf(buf, "%lu\n", val); + return sysfs_emit(buf, "%lu\n", val); } static void rps_dev_flow_table_release(struct rcu_head *rcu) @@ -1012,7 +1017,7 @@ static void rx_queue_release(struct kobject *kobj) #endif memset(kobj, 0, sizeof(*kobj)); - dev_put(queue->dev); + netdev_put(queue->dev, &queue->dev_tracker); } static const void *rx_queue_namespace(struct kobject *kobj) @@ -1052,7 +1057,7 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) /* Kobject_put later will trigger rx_queue_release call which * decreases dev refcount: Take that reference here */ - dev_hold(queue->dev); + netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL); kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, @@ -1201,13 +1206,9 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) { - unsigned long trans_timeout; - - spin_lock_irq(&queue->_xmit_lock); - trans_timeout = queue->trans_timeout; - spin_unlock_irq(&queue->_xmit_lock); + unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); - return sprintf(buf, fmt_ulong, trans_timeout); + return sysfs_emit(buf, fmt_ulong, trans_timeout); } static unsigned int get_netdev_queue_index(struct netdev_queue *queue) @@ -1254,15 +1255,15 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, * belongs to the root device it will be reported with just the * traffic class, so just "0" for TC 0 for example. */ - return num_tc < 0 ? sprintf(buf, "%d%d\n", tc, num_tc) : - sprintf(buf, "%d\n", tc); + return num_tc < 0 ? sysfs_emit(buf, "%d%d\n", tc, num_tc) : + sysfs_emit(buf, "%d\n", tc); } #ifdef CONFIG_XPS static ssize_t tx_maxrate_show(struct netdev_queue *queue, char *buf) { - return sprintf(buf, "%lu\n", queue->tx_maxrate); + return sysfs_emit(buf, "%lu\n", queue->tx_maxrate); } static ssize_t tx_maxrate_store(struct netdev_queue *queue, @@ -1316,7 +1317,7 @@ static struct netdev_queue_attribute queue_traffic_class __ro_after_init */ static ssize_t bql_show(char *buf, unsigned int value) { - return sprintf(buf, "%u\n", value); + return sysfs_emit(buf, "%u\n", value); } static ssize_t bql_set(const char *buf, const size_t count, @@ -1345,7 +1346,7 @@ static ssize_t bql_show_hold_time(struct netdev_queue *queue, { struct dql *dql = &queue->dql; - return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); } static ssize_t bql_set_hold_time(struct netdev_queue *queue, @@ -1373,7 +1374,7 @@ static ssize_t bql_show_inflight(struct netdev_queue *queue, { struct dql *dql = &queue->dql; - return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed); + return sysfs_emit(buf, "%u\n", dql->num_queued - dql->num_completed); } static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = @@ -1452,7 +1453,7 @@ static ssize_t xps_queue_show(struct net_device *dev, unsigned int index, for (i = map->len; i--;) { if (map->queues[i] == index) { - set_bit(j, mask); + __set_bit(j, mask); break; } } @@ -1619,7 +1620,7 @@ static void netdev_queue_release(struct kobject *kobj) struct netdev_queue *queue = to_netdev_queue(kobj); memset(kobj, 0, sizeof(*kobj)); - dev_put(queue->dev); + netdev_put(queue->dev, &queue->dev_tracker); } static const void *netdev_queue_namespace(struct kobject *kobj) @@ -1659,7 +1660,7 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) /* Kobject_put later will trigger netdev_queue_release call * which decreases dev refcount: Take that reference here */ - dev_hold(queue->dev); + netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL); kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, @@ -1706,6 +1707,13 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) int i; int error = 0; + /* Tx queue kobjects are allowed to be updated when a device is being + * unregistered, but solely to remove queues from qdiscs. Any path + * adding queues should be fixed. + */ + WARN(dev->reg_state == NETREG_UNREGISTERING && new_num > old_num, + "New queues can't be registered after device unregistration."); + for (i = old_num; i < new_num; i++) { error = netdev_queue_add_kobject(dev, i); if (error) { @@ -1820,6 +1828,9 @@ static void remove_queue_kobjects(struct net_device *dev) net_rx_queue_update_kobjects(dev, real_rx, 0); netdev_queue_update_kobjects(dev, real_tx, 0); + + dev->real_num_rx_queues = 0; + dev->real_num_tx_queues = 0; #ifdef CONFIG_SYSFS kset_unregister(dev->queues_kset); #endif @@ -1997,14 +2008,8 @@ int netdev_register_kobject(struct net_device *ndev) *groups++ = &netstat_group; -#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) - if (ndev->ieee80211_ptr) - *groups++ = &wireless_group; -#if IS_ENABLED(CONFIG_WIRELESS_EXT) - else if (ndev->wireless_handlers) + if (wireless_group_needed(ndev)) *groups++ = &wireless_group; -#endif -#endif #endif /* CONFIG_SYSFS */ error = device_add(dev); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 202fa5eacd0f..f64654df71a2 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -44,13 +44,7 @@ EXPORT_SYMBOL_GPL(net_rwsem); static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) }; #endif -struct net init_net = { - .ns.count = REFCOUNT_INIT(1), - .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), -#ifdef CONFIG_KEYS - .key_domain = &init_net_key_domain, -#endif -}; +struct net init_net; EXPORT_SYMBOL(init_net); static bool init_net_initialized; @@ -123,6 +117,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) static int ops_init(const struct pernet_operations *ops, struct net *net) { + struct net_generic *ng; int err = -ENOMEM; void *data = NULL; @@ -141,7 +136,13 @@ static int ops_init(const struct pernet_operations *ops, struct net *net) if (!err) return 0; + if (ops->id && ops->size) { cleanup: + ng = rcu_dereference_protected(net->gen, + lockdep_is_held(&pernet_ops_rwsem)); + ng->ptr[*ops->id] = NULL; + } + kfree(data); out: @@ -164,8 +165,10 @@ static void ops_exit_list(const struct pernet_operations *ops, { struct net *net; if (ops->exit) { - list_for_each_entry(net, net_exit_list, exit_list) + list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); + cond_resched(); + } } if (ops->exit_batch) ops->exit_batch(net_exit_list); @@ -299,6 +302,7 @@ struct net *get_net_ns_by_id(const struct net *net, int id) return peer; } +EXPORT_SYMBOL_GPL(get_net_ns_by_id); /* * setup_net runs the initializers for the network namespace object. @@ -311,6 +315,8 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) LIST_HEAD(net_exit_list); refcount_set(&net->ns.count, 1); + ref_tracker_dir_init(&net->refcnt_tracker, 128); + refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); preempt_disable(); @@ -359,6 +365,8 @@ out_undo: static int __net_init net_defaults_init_net(struct net *net) { net->core.sysctl_somaxconn = SOMAXCONN; + net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; + return 0; } @@ -635,6 +643,7 @@ static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { + ref_tracker_dir_exit(&net->refcnt_tracker); /* Cleanup the network namespace in process context */ if (llist_add(&net->cleanup_list, &cleanup_list)) queue_work(netns_wq, &net_cleanup_work); @@ -1079,7 +1088,7 @@ out: rtnl_set_sk_err(net, RTNLGRP_NSID, err); } -static int __init net_ns_init(void) +void __init net_ns_init(void) { struct net_generic *ng; @@ -1100,6 +1109,9 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); +#ifdef CONFIG_KEYS + init_net.key_domain = &init_net_key_domain; +#endif down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); @@ -1114,12 +1126,8 @@ static int __init net_ns_init(void) RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, RTNL_FLAG_DOIT_UNLOCKED); - - return 0; } -pure_initcall(net_ns_init); - static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) { ops_pre_exit_list(ops, net_exit_list); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 1a6a86693b74..d6a70aeaa503 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -66,7 +66,7 @@ struct update_classid_context { #define UPDATE_CLASSID_BATCH 1000 -static int update_classid_sock(const void *v, struct file *file, unsigned n) +static int update_classid_sock(const void *v, struct file *file, unsigned int n) { struct update_classid_context *ctx = (void *)v; struct socket *sock = sock_from_file(file); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index edfc0f8011f8..9be762e1d042 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -556,7 +556,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) if ((delim = strchr(cur, ',')) == NULL) goto parse_failed; *delim = 0; - strlcpy(np->dev_name, cur, sizeof(np->dev_name)); + strscpy(np->dev_name, cur, sizeof(np->dev_name)); cur = delim; } cur++; @@ -610,7 +610,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) int err; np->dev = ndev; - strlcpy(np->dev_name, ndev->name, IFNAMSIZ); + strscpy(np->dev_name, ndev->name, IFNAMSIZ); if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", @@ -776,7 +776,7 @@ put_noaddr: err = __netpoll_setup(np, ndev); if (err) goto put; - + netdev_tracker_alloc(ndev, &np->dev_tracker, GFP_KERNEL); rtnl_unlock(); return 0; @@ -853,7 +853,7 @@ void netpoll_cleanup(struct netpoll *np) if (!np->dev) goto out; __netpoll_cleanup(np); - dev_put(np->dev); + netdev_put(np->dev, &np->dev_tracker); np->dev = NULL; out: rtnl_unlock(); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 1a6978427d6c..9b203d8660e4 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -16,8 +16,9 @@ #include <linux/dma-direction.h> #include <linux/dma-mapping.h> #include <linux/page-flags.h> -#include <linux/mm.h> /* for __put_page() */ +#include <linux/mm.h> /* for put_page() */ #include <linux/poison.h> +#include <linux/ethtool.h> #include <trace/events/page_pool.h> @@ -26,6 +27,112 @@ #define BIAS_MAX LONG_MAX +#ifdef CONFIG_PAGE_POOL_STATS +/* alloc_stat_inc is intended to be used in softirq context */ +#define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) +/* recycle_stat_inc is safe to use when preemption is possible. */ +#define recycle_stat_inc(pool, __stat) \ + do { \ + struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ + this_cpu_inc(s->__stat); \ + } while (0) + +#define recycle_stat_add(pool, __stat, val) \ + do { \ + struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ + this_cpu_add(s->__stat, val); \ + } while (0) + +static const char pp_stats[][ETH_GSTRING_LEN] = { + "rx_pp_alloc_fast", + "rx_pp_alloc_slow", + "rx_pp_alloc_slow_ho", + "rx_pp_alloc_empty", + "rx_pp_alloc_refill", + "rx_pp_alloc_waive", + "rx_pp_recycle_cached", + "rx_pp_recycle_cache_full", + "rx_pp_recycle_ring", + "rx_pp_recycle_ring_full", + "rx_pp_recycle_released_ref", +}; + +bool page_pool_get_stats(struct page_pool *pool, + struct page_pool_stats *stats) +{ + int cpu = 0; + + if (!stats) + return false; + + /* The caller is responsible to initialize stats. */ + stats->alloc_stats.fast += pool->alloc_stats.fast; + stats->alloc_stats.slow += pool->alloc_stats.slow; + stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order; + stats->alloc_stats.empty += pool->alloc_stats.empty; + stats->alloc_stats.refill += pool->alloc_stats.refill; + stats->alloc_stats.waive += pool->alloc_stats.waive; + + for_each_possible_cpu(cpu) { + const struct page_pool_recycle_stats *pcpu = + per_cpu_ptr(pool->recycle_stats, cpu); + + stats->recycle_stats.cached += pcpu->cached; + stats->recycle_stats.cache_full += pcpu->cache_full; + stats->recycle_stats.ring += pcpu->ring; + stats->recycle_stats.ring_full += pcpu->ring_full; + stats->recycle_stats.released_refcnt += pcpu->released_refcnt; + } + + return true; +} +EXPORT_SYMBOL(page_pool_get_stats); + +u8 *page_pool_ethtool_stats_get_strings(u8 *data) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pp_stats); i++) { + memcpy(data, pp_stats[i], ETH_GSTRING_LEN); + data += ETH_GSTRING_LEN; + } + + return data; +} +EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings); + +int page_pool_ethtool_stats_get_count(void) +{ + return ARRAY_SIZE(pp_stats); +} +EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); + +u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +{ + struct page_pool_stats *pool_stats = stats; + + *data++ = pool_stats->alloc_stats.fast; + *data++ = pool_stats->alloc_stats.slow; + *data++ = pool_stats->alloc_stats.slow_high_order; + *data++ = pool_stats->alloc_stats.empty; + *data++ = pool_stats->alloc_stats.refill; + *data++ = pool_stats->alloc_stats.waive; + *data++ = pool_stats->recycle_stats.cached; + *data++ = pool_stats->recycle_stats.cache_full; + *data++ = pool_stats->recycle_stats.ring; + *data++ = pool_stats->recycle_stats.ring_full; + *data++ = pool_stats->recycle_stats.released_refcnt; + + return data; +} +EXPORT_SYMBOL(page_pool_ethtool_stats_get); + +#else +#define alloc_stat_inc(pool, __stat) +#define recycle_stat_inc(pool, __stat) +#define recycle_stat_add(pool, __stat, val) +#endif + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -73,6 +180,12 @@ static int page_pool_init(struct page_pool *pool, pool->p.flags & PP_FLAG_PAGE_FRAG) return -EINVAL; +#ifdef CONFIG_PAGE_POOL_STATS + pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); + if (!pool->recycle_stats) + return -ENOMEM; +#endif + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; @@ -117,8 +230,10 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) int pref_nid; /* preferred NUMA node */ /* Quicker fallback, avoid locks when ring is empty */ - if (__ptr_ring_empty(r)) + if (__ptr_ring_empty(r)) { + alloc_stat_inc(pool, empty); return NULL; + } /* Softirq guarantee CPU and thus NUMA node is stable. This, * assumes CPU refilling driver RX-ring will also run RX-NAPI. @@ -130,9 +245,6 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ #endif - /* Slower-path: Get pages from locked ring queue */ - spin_lock(&r->consumer_lock); - /* Refill alloc array, but only if NUMA match */ do { page = __ptr_ring_consume(r); @@ -148,16 +260,18 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) * This limit stress on page buddy alloactor. */ page_pool_return_page(pool, page); + alloc_stat_inc(pool, waive); page = NULL; break; } } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); /* Return last page */ - if (likely(pool->alloc.count > 0)) + if (likely(pool->alloc.count > 0)) { page = pool->alloc.cache[--pool->alloc.count]; + alloc_stat_inc(pool, refill); + } - spin_unlock(&r->consumer_lock); return page; } @@ -170,6 +284,7 @@ static struct page *__page_pool_get_cached(struct page_pool *pool) if (likely(pool->alloc.count)) { /* Fast-path */ page = pool->alloc.cache[--pool->alloc.count]; + alloc_stat_inc(pool, fast); } else { page = page_pool_refill_alloc_cache(pool); } @@ -217,6 +332,8 @@ static void page_pool_set_pp_info(struct page_pool *pool, { page->pp = pool; page->pp_magic |= PP_SIGNATURE; + if (pool->p.init_callback) + pool->p.init_callback(page, pool->p.init_arg); } static void page_pool_clear_pp_info(struct page *page) @@ -241,6 +358,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, return NULL; } + alloc_stat_inc(pool, slow_high_order); page_pool_set_pp_info(pool, page); /* Track how many pages are held 'in-flight' */ @@ -271,7 +389,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); - nr_pages = alloc_pages_bulk_array(gfp, bulk, pool->alloc.cache); + nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk, + pool->alloc.cache); if (unlikely(!nr_pages)) return NULL; @@ -295,10 +414,12 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, } /* Return last page */ - if (likely(pool->alloc.count > 0)) + if (likely(pool->alloc.count > 0)) { page = pool->alloc.cache[--pool->alloc.count]; - else + alloc_stat_inc(pool, slow); + } else { page = NULL; + } /* When page just alloc'ed is should/must have refcnt 1. */ return page; @@ -396,7 +517,12 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) else ret = ptr_ring_produce_bh(&pool->ring, page); - return (ret == 0) ? true : false; + if (!ret) { + recycle_stat_inc(pool, ring); + return true; + } + + return false; } /* Only allow direct recycling in special circumstances, into the @@ -407,11 +533,14 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) static bool page_pool_recycle_in_cache(struct page *page, struct page_pool *pool) { - if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) + if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { + recycle_stat_inc(pool, cache_full); return false; + } /* Caller MUST have verified/know (page_ref_count(page) == 1) */ pool->alloc.cache[pool->alloc.count++] = page; + recycle_stat_inc(pool, cached); return true; } @@ -425,11 +554,6 @@ static __always_inline struct page * __page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { - /* It is not the last user for the page frag case */ - if (pool->p.flags & PP_FLAG_PAGE_FRAG && - page_pool_atomic_sub_frag_count_return(page, 1)) - return NULL; - /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. @@ -466,6 +590,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * doing refcnt based recycle tricks, meaning another process * will be invoking put_page. */ + recycle_stat_inc(pool, released_refcnt); /* Do not replace this with page_pool_return_page() */ page_pool_release_page(pool, page); put_page(page); @@ -473,16 +598,17 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, return NULL; } -void page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); if (page && !page_pool_recycle_in_ring(pool, page)) { /* Cache full, fallback to free pages */ + recycle_stat_inc(pool, ring_full); page_pool_return_page(pool, page); } } -EXPORT_SYMBOL(page_pool_put_page); +EXPORT_SYMBOL(page_pool_put_defragged_page); /* Caller must not use data area after call, as this function overwrites it */ void page_pool_put_page_bulk(struct page_pool *pool, void **data, @@ -493,6 +619,10 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, for (i = 0; i < count; i++) { struct page *page = virt_to_head_page(data[i]); + /* It is not the last user for the page frag case */ + if (!page_pool_is_last_frag(pool, page)) + continue; + page = __page_pool_put_page(pool, page, -1, false); /* Approved for bulk recycling in ptr_ring cache */ if (page) @@ -505,9 +635,13 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, /* Bulk producer into ptr_ring page_pool cache */ page_pool_ring_lock(pool); for (i = 0; i < bulk_len; i++) { - if (__ptr_ring_produce(&pool->ring, data[i])) - break; /* ring full */ + if (__ptr_ring_produce(&pool->ring, data[i])) { + /* ring full */ + recycle_stat_inc(pool, ring_full); + break; + } } + recycle_stat_add(pool, ring, i); page_pool_ring_unlock(pool); /* Hopefully all pages was return into ptr_ring */ @@ -528,8 +662,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool, long drain_count = BIAS_MAX - pool->frag_users; /* Some user is still using the page frag */ - if (likely(page_pool_atomic_sub_frag_count_return(page, - drain_count))) + if (likely(page_pool_defrag_page(page, drain_count))) return NULL; if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { @@ -550,8 +683,7 @@ static void page_pool_free_frag(struct page_pool *pool) pool->frag_page = NULL; - if (!page || - page_pool_atomic_sub_frag_count_return(page, drain_count)) + if (!page || page_pool_defrag_page(page, drain_count)) return; page_pool_return_page(pool, page); @@ -573,8 +705,10 @@ struct page *page_pool_alloc_frag(struct page_pool *pool, if (page && *offset + size > max_size) { page = page_pool_drain_frag(pool, page); - if (page) + if (page) { + alloc_stat_inc(pool, fast); goto frag_reset; + } } if (!page) { @@ -590,12 +724,13 @@ frag_reset: pool->frag_users = 1; *offset = 0; pool->frag_offset = size; - page_pool_set_frag_count(page, BIAS_MAX); + page_pool_fragment_page(page, BIAS_MAX); return page; } pool->frag_users++; pool->frag_offset = *offset + size; + alloc_stat_inc(pool, fast); return page; } EXPORT_SYMBOL(page_pool_alloc_frag); @@ -625,6 +760,9 @@ static void page_pool_free(struct page_pool *pool) if (pool->p.flags & PP_FLAG_DMA_MAP) put_device(pool->p.dev); +#ifdef CONFIG_PAGE_POOL_STATS + free_percpu(pool->recycle_stats); +#endif kfree(pool); } @@ -691,10 +829,12 @@ static void page_pool_release_retry(struct work_struct *wq) schedule_delayed_work(&pool->release_dw, DEFER_TIME); } -void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)) +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), + struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; + pool->xdp_mem_id = mem->id; } void page_pool_destroy(struct page_pool *pool) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index a3d74e2704c4..c3763056c554 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -410,6 +410,7 @@ struct pktgen_dev { * device name (not when the inject is * started as it used to do.) */ + netdevice_tracker dev_tracker; char odevname[32]; struct flow_state *flows; unsigned int cflows; /* Concurrent flows (config) */ @@ -545,7 +546,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, static int pgctrl_open(struct inode *inode, struct file *file) { - return single_open(file, pgctrl_show, PDE_DATA(inode)); + return single_open(file, pgctrl_show, pde_data(inode)); } static const struct proc_ops pktgen_proc_ops = { @@ -1810,7 +1811,7 @@ static ssize_t pktgen_if_write(struct file *file, static int pktgen_if_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_if_show, PDE_DATA(inode)); + return single_open(file, pktgen_if_show, pde_data(inode)); } static const struct proc_ops pktgen_if_proc_ops = { @@ -1947,7 +1948,7 @@ out: static int pktgen_thread_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_thread_show, PDE_DATA(inode)); + return single_open(file, pktgen_thread_show, pde_data(inode)); } static const struct proc_ops pktgen_thread_proc_ops = { @@ -2099,7 +2100,7 @@ static int pktgen_setup_dev(const struct pktgen_net *pn, /* Clean old setups */ if (pkt_dev->odev) { - dev_put(pkt_dev->odev); + netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker); pkt_dev->odev = NULL; } @@ -2117,6 +2118,7 @@ static int pktgen_setup_dev(const struct pktgen_net *pn, err = -ENETDOWN; } else { pkt_dev->odev = odev; + netdev_tracker_alloc(odev, &pkt_dev->dev_tracker, GFP_KERNEL); return 0; } @@ -2322,7 +2324,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) pkt_dev->curfl = 0; /*reset */ } } else { - flow = prandom_u32() % pkt_dev->cflows; + flow = prandom_u32_max(pkt_dev->cflows); pkt_dev->curfl = flow; if (pkt_dev->flows[flow].count > pkt_dev->lflow) { @@ -2378,10 +2380,9 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; if (pkt_dev->flags & F_QUEUE_MAP_RND) { - t = prandom_u32() % - (pkt_dev->queue_map_max - - pkt_dev->queue_map_min + 1) - + pkt_dev->queue_map_min; + t = prandom_u32_max(pkt_dev->queue_map_max - + pkt_dev->queue_map_min + 1) + + pkt_dev->queue_map_min; } else { t = pkt_dev->cur_queue_map + 1; if (t > pkt_dev->queue_map_max) @@ -2410,7 +2411,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACSRC_RND) - mc = prandom_u32() % pkt_dev->src_mac_count; + mc = prandom_u32_max(pkt_dev->src_mac_count); else { mc = pkt_dev->cur_src_mac_offset++; if (pkt_dev->cur_src_mac_offset >= @@ -2436,7 +2437,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACDST_RND) - mc = prandom_u32() % pkt_dev->dst_mac_count; + mc = prandom_u32_max(pkt_dev->dst_mac_count); else { mc = pkt_dev->cur_dst_mac_offset++; @@ -2463,23 +2464,23 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | - ((__force __be32)prandom_u32() & + ((__force __be32)get_random_u32() & htonl(0x000fffff)); } if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { - pkt_dev->vlan_id = prandom_u32() & (4096 - 1); + pkt_dev->vlan_id = prandom_u32_max(4096); } if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { - pkt_dev->svlan_id = prandom_u32() & (4096 - 1); + pkt_dev->svlan_id = prandom_u32_max(4096); } if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { if (pkt_dev->flags & F_UDPSRC_RND) - pkt_dev->cur_udp_src = prandom_u32() % - (pkt_dev->udp_src_max - pkt_dev->udp_src_min) - + pkt_dev->udp_src_min; + pkt_dev->cur_udp_src = prandom_u32_max( + pkt_dev->udp_src_max - pkt_dev->udp_src_min) + + pkt_dev->udp_src_min; else { pkt_dev->cur_udp_src++; @@ -2490,9 +2491,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { if (pkt_dev->flags & F_UDPDST_RND) { - pkt_dev->cur_udp_dst = prandom_u32() % - (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) - + pkt_dev->udp_dst_min; + pkt_dev->cur_udp_dst = prandom_u32_max( + pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) + + pkt_dev->udp_dst_min; } else { pkt_dev->cur_udp_dst++; if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max) @@ -2507,7 +2508,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; if (pkt_dev->flags & F_IPSRC_RND) - t = prandom_u32() % (imx - imn) + imn; + t = prandom_u32_max(imx - imn) + imn; else { t = ntohl(pkt_dev->cur_saddr); t++; @@ -2529,8 +2530,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->flags & F_IPDST_RND) { do { - t = prandom_u32() % - (imx - imn) + imn; + t = prandom_u32_max(imx - imn) + + imn; s = htonl(t); } while (ipv4_is_loopback(s) || ipv4_is_multicast(s) || @@ -2567,7 +2568,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < 4; i++) { pkt_dev->cur_in6_daddr.s6_addr32[i] = - (((__force __be32)prandom_u32() | + (((__force __be32)get_random_u32() | pkt_dev->min_in6_daddr.s6_addr32[i]) & pkt_dev->max_in6_daddr.s6_addr32[i]); } @@ -2577,9 +2578,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; if (pkt_dev->flags & F_TXSIZE_RND) { - t = prandom_u32() % - (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size) - + pkt_dev->min_pkt_size; + t = prandom_u32_max(pkt_dev->max_pkt_size - + pkt_dev->min_pkt_size) + + pkt_dev->min_pkt_size; } else { t = pkt_dev->cur_pkt_size + 1; if (t > pkt_dev->max_pkt_size) @@ -2588,7 +2589,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->cur_pkt_size = t; } else if (pkt_dev->n_imix_entries > 0) { struct imix_pkt *entry; - __u32 t = prandom_u32() % IMIX_PRECISION; + __u32 t = prandom_u32_max(IMIX_PRECISION); __u8 entry_index = pkt_dev->imix_distribution[t]; entry = &pkt_dev->imix_entries[entry_index]; @@ -3805,7 +3806,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) return add_dev_to_thread(t, pkt_dev); out2: - dev_put(pkt_dev->odev); + netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker); out1: #ifdef CONFIG_XFRM free_SAs(pkt_dev); @@ -3899,7 +3900,7 @@ static int pktgen_remove_device(struct pktgen_thread *t, /* Dis-associate from the interface */ if (pkt_dev->odev) { - dev_put(pkt_dev->odev); + netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker); pkt_dev->odev = NULL; } diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index dd4cf01d1e0a..598041b0499e 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -137,6 +137,18 @@ struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type) } EXPORT_SYMBOL_GPL(ptp_parse_header); +bool ptp_msg_is_sync(struct sk_buff *skb, unsigned int type) +{ + struct ptp_header *hdr; + + hdr = ptp_parse_header(skb, type); + if (!hdr) + return false; + + return ptp_get_msgtype(hdr, type) == PTP_MSGTYPE_SYNC; +} +EXPORT_SYMBOL_GPL(ptp_msg_is_sync); + void __init ptp_classifier_init(void) { static struct sock_filter ptp_filter[] __initdata = { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2af8aeeadadf..74864dc46a7e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -54,6 +54,8 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> +#include "dev.h" + #define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 40 @@ -95,6 +97,39 @@ void __rtnl_unlock(void) defer_kfree_skb_list = NULL; + /* Ensure that we didn't actually add any TODO item when __rtnl_unlock() + * is used. In some places, e.g. in cfg80211, we have code that will do + * something like + * rtnl_lock() + * wiphy_lock() + * ... + * rtnl_unlock() + * + * and because netdev_run_todo() acquires the RTNL for items on the list + * we could cause a situation such as this: + * Thread 1 Thread 2 + * rtnl_lock() + * unregister_netdevice() + * __rtnl_unlock() + * rtnl_lock() + * wiphy_lock() + * rtnl_unlock() + * netdev_run_todo() + * __rtnl_unlock() + * + * // list not empty now + * // because of thread 2 + * rtnl_lock() + * while (!list_empty(...)) + * rtnl_lock() + * wiphy_lock() + * **** DEADLOCK **** + * + * However, usage of __rtnl_unlock() is rare, and so we can ensure that + * it's not used in cases where something is added to do the list. + */ + WARN_ON(!list_empty(&net_todo_list)); + mutex_unlock(&rtnl_mutex); while (head) { @@ -214,6 +249,8 @@ static int rtnl_register_internal(struct module *owner, if (dumpit) link->dumpit = dumpit; + WARN_ON(rtnl_msgtype_kind(msgtype) != RTNL_KIND_DEL && + (flags & RTNL_FLAG_BULK_DEL_SUPPORTED)); link->flags |= flags; /* publish protocol:msgtype */ @@ -459,7 +496,7 @@ static void rtnl_lock_unregistering_all(void) * setup_net() and cleanup_net() are not possible. */ for_each_net(net) { - if (net->dev_unreg_count > 0) { + if (atomic_read(&net->dev_unreg_count) > 0) { unregistering = true; break; } @@ -829,22 +866,20 @@ static void set_operstate(struct net_device *dev, unsigned char transition) break; case IF_OPER_TESTING: - if (operstate == IF_OPER_UP || - operstate == IF_OPER_UNKNOWN) + if (netif_oper_up(dev)) operstate = IF_OPER_TESTING; break; case IF_OPER_DORMANT: - if (operstate == IF_OPER_UP || - operstate == IF_OPER_UNKNOWN) + if (netif_oper_up(dev)) operstate = IF_OPER_DORMANT; break; } if (dev->operstate != operstate) { - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); dev->operstate = operstate; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); netdev_state_change(dev); } } @@ -1022,10 +1057,14 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + + nla_total_size(4) /* IFLA_ALLMULTI */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ @@ -1698,6 +1737,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, { struct ifinfomsg *ifm; struct nlmsghdr *nlh; + struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -1715,6 +1755,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; + qdisc = rtnl_dereference(dev->qdisc); if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || nla_put_u8(skb, IFLA_OPERSTATE, @@ -1725,16 +1766,20 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || + nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || + nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || + nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || + nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || - (dev->qdisc && - nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || + (qdisc && + nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + @@ -1880,6 +1925,10 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, + [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, + [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, + [IFLA_ALLMULTI] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2264,6 +2313,19 @@ invalid_attr: return -EINVAL; } +static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, + int max_tx_rate) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_set_vf_rate) + return -EOPNOTSUPP; + if (max_tx_rate && max_tx_rate < min_tx_rate) + return -EINVAL; + + return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate); +} + static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { @@ -2393,11 +2455,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (err < 0) return err; - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_rate) - err = ops->ndo_set_vf_rate(dev, ivt->vf, - ivf.min_tx_rate, - ivt->rate); + err = rtnl_set_vf_rate(dev, ivt->vf, + ivf.min_tx_rate, ivt->rate); if (err < 0) return err; } @@ -2407,11 +2466,9 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (ivt->vf >= INT_MAX) return -EINVAL; - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_rate) - err = ops->ndo_set_vf_rate(dev, ivt->vf, - ivt->min_tx_rate, - ivt->max_tx_rate); + + err = rtnl_set_vf_rate(dev, ivt->vf, + ivt->min_tx_rate, ivt->max_tx_rate); if (err < 0) return err; } @@ -2539,13 +2596,12 @@ static int do_set_proto_down(struct net_device *dev, struct netlink_ext_ack *extack) { struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; - const struct net_device_ops *ops = dev->netdev_ops; unsigned long mask = 0; u32 value; bool proto_down; int err; - if (!ops->ndo_change_proto_down) { + if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) { NL_SET_ERR_MSG(extack, "Protodown not supported by device"); return -EOPNOTSUPP; } @@ -2595,17 +2651,23 @@ static int do_set_proto_down(struct net_device *dev, static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, - struct nlattr **tb, char *ifname, int status) + struct nlattr **tb, int status) { const struct net_device_ops *ops = dev->netdev_ops; + char ifname[IFNAMSIZ]; int err; err = validate_linkmsg(dev, tb, extack); if (err < 0) return err; + if (tb[IFLA_IFNAME]) + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + else + ifname[0] = '\0'; + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { - const char *pat = ifname && ifname[0] ? ifname : NULL; + const char *pat = ifname[0] ? ifname : NULL; struct net *net; int new_ifindex; @@ -2715,13 +2777,6 @@ static int do_setlink(const struct sk_buff *skb, call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } - if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), - extack); - if (err < 0) - goto errout; - } - if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) @@ -2729,6 +2784,13 @@ static int do_setlink(const struct sk_buff *skb, status |= DO_SETLINK_MODIFIED; } + if (ifm->ifi_flags || ifm->ifi_change) { + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); + if (err < 0) + goto errout; + } + if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) @@ -2748,7 +2810,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); - if (max_size > GSO_MAX_SIZE) { + if (max_size > dev->tso_max_size) { err = -EINVAL; goto errout; } @@ -2762,13 +2824,22 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); - if (max_segs > GSO_MAX_SEGS) { + if (max_segs > GSO_MAX_SEGS || max_segs > dev->tso_max_segs) { err = -EINVAL; goto errout; } if (dev->gso_max_segs ^ max_segs) { - dev->gso_max_segs = max_segs; + netif_set_gso_max_segs(dev, max_segs); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GRO_MAX_SIZE]) { + u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); + + if (dev->gro_max_size ^ gro_max_size) { + netif_set_gro_max_size(dev, gro_max_size); status |= DO_SETLINK_MODIFIED; } } @@ -2779,11 +2850,11 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; dev->link_mode = value; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); } if (tb[IFLA_VFINFO_LIST]) { @@ -2952,21 +3023,16 @@ errout: } static struct net_device *rtnl_dev_get(struct net *net, - struct nlattr *ifname_attr, - struct nlattr *altifname_attr, - char *ifname) -{ - char buffer[ALTIFNAMSIZ]; - - if (!ifname) { - ifname = buffer; - if (ifname_attr) - nla_strscpy(ifname, ifname_attr, IFNAMSIZ); - else if (altifname_attr) - nla_strscpy(ifname, altifname_attr, ALTIFNAMSIZ); - else - return NULL; - } + struct nlattr *tb[]) +{ + char ifname[ALTIFNAMSIZ]; + + if (tb[IFLA_IFNAME]) + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + else if (tb[IFLA_ALT_IFNAME]) + nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ); + else + return NULL; return __dev_get_by_name(net, ifname); } @@ -2979,7 +3045,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; int err; struct nlattr *tb[IFLA_MAX+1]; - char ifname[IFNAMSIZ]; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); @@ -2990,17 +3055,12 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; - if (tb[IFLA_IFNAME]) - nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - else - ifname[0] = '\0'; - err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) - dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname); + dev = rtnl_dev_get(net, tb); else goto errout; @@ -3009,7 +3069,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0); + err = do_setlink(skb, dev, ifm, extack, tb, 0); errout: return err; } @@ -3098,15 +3158,14 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) - dev = rtnl_dev_get(net, tb[IFLA_IFNAME], - tb[IFLA_ALT_IFNAME], NULL); + dev = rtnl_dev_get(net, tb); else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else goto out; if (!dev) { - if (tb[IFLA_IFNAME] || ifm->ifi_index > 0) + if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME] || ifm->ifi_index > 0) err = -ENODEV; goto out; @@ -3222,7 +3281,9 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) - dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); + netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); + if (tb[IFLA_GRO_MAX_SIZE]) + netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); return dev; } @@ -3239,7 +3300,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb, for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { - err = do_setlink(skb, dev, ifm, extack, tb, NULL, 0); + err = do_setlink(skb, dev, ifm, extack, tb, 0); if (err < 0) return err; } @@ -3248,24 +3309,118 @@ static int rtnl_group_changelink(const struct sk_buff *skb, return 0; } -static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attr, struct netlink_ext_ack *extack) +static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, + const struct rtnl_link_ops *ops, + struct nlattr **tb, struct nlattr **data, + struct netlink_ext_ack *extack) { - struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; unsigned char name_assign_type = NET_NAME_USER; + struct net *net = sock_net(skb->sk); + struct net *dest_net, *link_net; + struct net_device *dev; + char ifname[IFNAMSIZ]; + int err; + + if (!ops->alloc && !ops->setup) + return -EOPNOTSUPP; + + if (tb[IFLA_IFNAME]) { + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + } else { + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + name_assign_type = NET_NAME_ENUM; + } + + dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); + + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); + + link_net = get_net_ns_by_id(dest_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + err = -EINVAL; + goto out; + } + err = -EPERM; + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) + goto out; + } else { + link_net = NULL; + } + + dev = rtnl_create_link(link_net ? : dest_net, ifname, + name_assign_type, ops, tb, extack); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out; + } + + dev->ifindex = ifm->ifi_index; + + if (ops->newlink) + err = ops->newlink(link_net ? : net, dev, tb, data, extack); + else + err = register_netdevice(dev); + if (err < 0) { + free_netdev(dev); + goto out; + } + + err = rtnl_configure_link(dev, ifm); + if (err < 0) + goto out_unregister; + if (link_net) { + err = dev_change_net_namespace(dev, dest_net, ifname); + if (err < 0) + goto out_unregister; + } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); + if (err) + goto out_unregister; + } +out: + if (link_net) + put_net(link_net); + put_net(dest_net); + return err; +out_unregister: + if (ops->newlink) { + LIST_HEAD(list_kill); + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); + } + goto out; +} + +struct rtnl_newlink_tbs { + struct nlattr *tb[IFLA_MAX + 1]; + struct nlattr *attr[RTNL_MAX_TYPE + 1]; + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; +}; + +static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct rtnl_newlink_tbs *tbs, + struct netlink_ext_ack *extack) +{ struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; - const struct rtnl_link_ops *m_ops = NULL; - struct net_device *master_dev = NULL; + struct nlattr ** const tb = tbs->tb; + const struct rtnl_link_ops *m_ops; + struct net_device *master_dev; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; - struct nlattr *tb[IFLA_MAX + 1]; - struct net *dest_net, *link_net; struct nlattr **slave_data; char kind[MODULE_NAME_LEN]; struct net_device *dev; struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; struct nlattr **data; + bool link_specified; int err; #ifdef CONFIG_MODULES @@ -3280,19 +3435,20 @@ replay: if (err < 0) return err; - if (tb[IFLA_IFNAME]) - nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - else - ifname[0] = '\0'; - ifm = nlmsg_data(nlh); - if (ifm->ifi_index > 0) + if (ifm->ifi_index > 0) { + link_specified = true; dev = __dev_get_by_index(net, ifm->ifi_index); - else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) - dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname); - else + } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { + link_specified = true; + dev = rtnl_dev_get(net, tb); + } else { + link_specified = false; dev = NULL; + } + master_dev = NULL; + m_ops = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); if (master_dev) @@ -3326,12 +3482,12 @@ replay: return -EINVAL; if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested_deprecated(attr, ops->maxtype, + err = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], ops->policy, extack); if (err < 0) return err; - data = attr; + data = tbs->attr; } if (ops->validate) { err = ops->validate(tb, data, extack); @@ -3347,14 +3503,14 @@ replay: if (m_ops->slave_maxtype && linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested_deprecated(slave_attr, + err = nla_parse_nested_deprecated(tbs->slave_attr, m_ops->slave_maxtype, linkinfo[IFLA_INFO_SLAVE_DATA], m_ops->slave_policy, extack); if (err < 0) return err; - slave_data = slave_attr; + slave_data = tbs->slave_attr; } } @@ -3388,11 +3544,16 @@ replay: status |= DO_SETLINK_NOTIFY; } - return do_setlink(skb, dev, ifm, extack, tb, ifname, status); + return do_setlink(skb, dev, ifm, extack, tb, status); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { - if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) + /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, + * or it's for a group + */ + if (link_specified) + return -ENODEV; + if (tb[IFLA_GROUP]) return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); @@ -3417,94 +3578,21 @@ replay: return -EOPNOTSUPP; } - if (!ops->alloc && !ops->setup) - return -EOPNOTSUPP; - - if (!ifname[0]) { - snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); - name_assign_type = NET_NAME_ENUM; - } - - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); - - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - - link_net = get_net_ns_by_id(dest_net, id); - if (!link_net) { - NL_SET_ERR_MSG(extack, "Unknown network namespace id"); - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } else { - link_net = NULL; - } - - dev = rtnl_create_link(link_net ? : dest_net, ifname, - name_assign_type, ops, tb, extack); - if (IS_ERR(dev)) { - err = PTR_ERR(dev); - goto out; - } - - dev->ifindex = ifm->ifi_index; - - if (ops->newlink) - err = ops->newlink(link_net ? : net, dev, tb, data, extack); - else - err = register_netdevice(dev); - if (err < 0) { - free_netdev(dev); - goto out; - } - - err = rtnl_configure_link(dev, ifm); - if (err < 0) - goto out_unregister; - if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); - if (err < 0) - goto out_unregister; - } - if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); - if (err) - goto out_unregister; - } -out: - if (link_net) - put_net(link_net); - put_net(dest_net); - return err; -out_unregister: - if (ops->newlink) { - LIST_HEAD(list_kill); - - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - } else { - unregister_netdevice(dev); - } - goto out; + return rtnl_newlink_create(skb, ifm, ops, tb, data, extack); } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - struct nlattr **attr; + struct rtnl_newlink_tbs *tbs; int ret; - attr = kmalloc_array(RTNL_MAX_TYPE + 1, sizeof(*attr), GFP_KERNEL); - if (!attr) + tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); + if (!tbs) return -ENOMEM; - ret = __rtnl_newlink(skb, nlh, attr, extack); - kfree(attr); + ret = __rtnl_newlink(skb, nlh, tbs, extack); + kfree(tbs); return ret; } @@ -3592,8 +3680,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) - dev = rtnl_dev_get(tgt_net, tb[IFLA_IFNAME], - tb[IFLA_ALT_IFNAME], NULL); + dev = rtnl_dev_get(tgt_net, tb); else goto out; @@ -3627,13 +3714,24 @@ static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, bool *changed, struct netlink_ext_ack *extack) { char *alt_ifname; + size_t size; int err; err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); if (err) return err; - alt_ifname = nla_strdup(attr, GFP_KERNEL); + if (cmd == RTM_NEWLINKPROP) { + size = rtnl_prop_list_size(dev); + size += nla_total_size(ALTIFNAMSIZ); + if (size >= U16_MAX) { + NL_SET_ERR_MSG(extack, + "effective property list too long"); + return -EINVAL; + } + } + + alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (!alt_ifname) return -ENOMEM; @@ -3677,8 +3775,7 @@ static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) - dev = rtnl_dev_get(net, tb[IFLA_IFNAME], - tb[IFLA_ALT_IFNAME], NULL); + dev = rtnl_dev_get(net, tb); else return -EINVAL; @@ -4096,22 +4193,36 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm, } EXPORT_SYMBOL(ndo_dflt_fdb_del); +static const struct nla_policy fdb_del_bulk_policy[NDA_MAX + 1] = { + [NDA_VLAN] = { .type = NLA_U16 }, + [NDA_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), + [NDA_NDM_STATE_MASK] = { .type = NLA_U16 }, + [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 }, +}; + static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK); struct net *net = sock_net(skb->sk); + const struct net_device_ops *ops; struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; - __u8 *addr; + __u8 *addr = NULL; int err; u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, - extack); + if (!del_bulk) { + err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, + NULL, extack); + } else { + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, + fdb_del_bulk_policy, extack); + } if (err < 0) return err; @@ -4127,9 +4238,12 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENODEV; } - if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { - NL_SET_ERR_MSG(extack, "invalid address"); - return -EINVAL; + if (!del_bulk) { + if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { + NL_SET_ERR_MSG(extack, "invalid address"); + return -EINVAL; + } + addr = nla_data(tb[NDA_LLADDR]); } if (dev->type != ARPHRD_ETHER) { @@ -4137,8 +4251,6 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } - addr = nla_data(tb[NDA_LLADDR]); - err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; @@ -4149,10 +4261,16 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); - const struct net_device_ops *ops = br_dev->netdev_ops; - if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid); + ops = br_dev->netdev_ops; + if (!del_bulk) { + if (ops->ndo_fdb_del) + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + } else { + if (ops->ndo_fdb_del_bulk) + err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid, + extack); + } if (err) goto out; @@ -4162,15 +4280,24 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { - if (dev->netdev_ops->ndo_fdb_del) - err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr, - vid); - else - err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); + ops = dev->netdev_ops; + if (!del_bulk) { + if (ops->ndo_fdb_del) + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + else + err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); + } else { + /* in case err was cleared by NTF_MASTER call */ + err = -EOPNOTSUPP; + if (ops->ndo_fdb_del_bulk) + err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid, + extack); + } if (!err) { - rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, - ndm->ndm_state); + if (!del_bulk) + rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, + ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } @@ -5023,82 +5150,259 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) (!idxattr || idxattr == attrid); } -#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1) -static int rtnl_get_offload_stats_attr_size(int attr_id) +static bool +rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id) { - switch (attr_id) { - case IFLA_OFFLOAD_XSTATS_CPU_HIT: - return sizeof(struct rtnl_link_stats64); - } + return dev->netdev_ops && + dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats && + dev->netdev_ops->ndo_has_offload_stats(dev, attr_id); +} - return 0; +static unsigned int +rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id) +{ + return rtnl_offload_xstats_have_ndo(dev, attr_id) ? + sizeof(struct rtnl_link_stats64) : 0; } -static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, - int *prividx) +static int +rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id, + struct sk_buff *skb) { + unsigned int size = rtnl_offload_xstats_get_size_ndo(dev, attr_id); struct nlattr *attr = NULL; - int attr_id, size; void *attr_data; int err; - if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && - dev->netdev_ops->ndo_get_offload_stats)) + if (!size) return -ENODATA; - for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; - attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { - if (attr_id < *prividx) - continue; + attr = nla_reserve_64bit(skb, attr_id, size, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + return -EMSGSIZE; - size = rtnl_get_offload_stats_attr_size(attr_id); - if (!size) - continue; + attr_data = nla_data(attr); + memset(attr_data, 0, size); - if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) - continue; + err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data); + if (err) + return err; + + return 0; +} + +static unsigned int +rtnl_offload_xstats_get_size_stats(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + bool enabled = netdev_offload_xstats_enabled(dev, type); + + return enabled ? sizeof(struct rtnl_hw_stats64) : 0; +} + +struct rtnl_offload_xstats_request_used { + bool request; + bool used; +}; + +static int +rtnl_offload_xstats_get_stats(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_offload_xstats_request_used *ru, + struct rtnl_hw_stats64 *stats, + struct netlink_ext_ack *extack) +{ + bool request; + bool used; + int err; + + request = netdev_offload_xstats_enabled(dev, type); + if (!request) { + used = false; + goto out; + } + + err = netdev_offload_xstats_get(dev, type, stats, &used, extack); + if (err) + return err; + +out: + if (ru) { + ru->request = request; + ru->used = used; + } + return 0; +} + +static int +rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id, + struct rtnl_offload_xstats_request_used *ru) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, attr_id); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, ru->request)) + goto nla_put_failure; + + if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; - attr = nla_reserve_64bit(skb, attr_id, size, +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int +rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev, + struct netlink_ext_ack *extack) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + struct rtnl_offload_xstats_request_used ru_l3; + struct nlattr *nest; + int err; + + err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack); + if (err) + return err; + + nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO); + if (!nest) + return -EMSGSIZE; + + if (rtnl_offload_xstats_fill_hw_s_info_one(skb, + IFLA_OFFLOAD_XSTATS_L3_STATS, + &ru_l3)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev, + int *prividx, u32 off_filter_mask, + struct netlink_ext_ack *extack) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO; + int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS; + int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; + bool have_data = false; + int err; + + if (*prividx <= attr_id_cpu_hit && + (off_filter_mask & + IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) { + err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb); + if (!err) { + have_data = true; + } else if (err != -ENODATA) { + *prividx = attr_id_cpu_hit; + return err; + } + } + + if (*prividx <= attr_id_hw_s_info && + (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) { + *prividx = attr_id_hw_s_info; + + err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack); + if (err) + return err; + + have_data = true; + *prividx = 0; + } + + if (*prividx <= attr_id_l3_stats && + (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) { + unsigned int size_l3; + struct nlattr *attr; + + *prividx = attr_id_l3_stats; + + size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3); + if (!size_l3) + goto skip_l3_stats; + attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3, IFLA_OFFLOAD_XSTATS_UNSPEC); if (!attr) - goto nla_put_failure; + return -EMSGSIZE; - attr_data = nla_data(attr); - memset(attr_data, 0, size); - err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, - attr_data); + err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL, + nla_data(attr), extack); if (err) - goto get_offload_stats_failure; + return err; + + have_data = true; +skip_l3_stats: + *prividx = 0; } - if (!attr) + if (!have_data) return -ENODATA; *prividx = 0; return 0; +} -nla_put_failure: - err = -EMSGSIZE; -get_offload_stats_failure: - *prividx = attr_id; - return err; +static unsigned int +rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + bool enabled = netdev_offload_xstats_enabled(dev, type); + + return nla_total_size(0) + + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */ + nla_total_size(sizeof(u8)) + + /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */ + (enabled ? nla_total_size(sizeof(u8)) : 0) + + 0; +} + +static unsigned int +rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + + return nla_total_size(0) + + /* IFLA_OFFLOAD_XSTATS_L3_STATS */ + rtnl_offload_xstats_get_size_hw_s_info_one(dev, t_l3) + + 0; } -static int rtnl_get_offload_stats_size(const struct net_device *dev) +static int rtnl_offload_xstats_get_size(const struct net_device *dev, + u32 off_filter_mask) { + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT; int nla_size = 0; - int attr_id; int size; - if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && - dev->netdev_ops->ndo_get_offload_stats)) - return 0; + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(attr_id_cpu_hit)) { + size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit); + nla_size += nla_total_size_64bit(size); + } - for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; - attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { - if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) - continue; - size = rtnl_get_offload_stats_attr_size(attr_id); + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO)) + nla_size += rtnl_offload_xstats_get_size_hw_s_info(dev); + + if (off_filter_mask & + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) { + size = rtnl_offload_xstats_get_size_stats(dev, t_l3); nla_size += nla_total_size_64bit(size); } @@ -5108,11 +5412,21 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev) return nla_size; } +struct rtnl_stats_dump_filters { + /* mask[0] filters outer attributes. Then individual nests have their + * filtering mask at the index of the nested attribute. + */ + u32 mask[IFLA_STATS_MAX + 1]; +}; + static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags, unsigned int filter_mask, - int *idxattr, int *prividx) + unsigned int flags, + const struct rtnl_stats_dump_filters *filters, + int *idxattr, int *prividx, + struct netlink_ext_ack *extack) { + unsigned int filter_mask = filters->mask[0]; struct if_stats_msg *ifsm; struct nlmsghdr *nlh; struct nlattr *attr; @@ -5138,8 +5452,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64, sizeof(struct rtnl_link_stats64), IFLA_STATS_UNSPEC); - if (!attr) + if (!attr) { + err = -EMSGSIZE; goto nla_put_failure; + } sp = nla_data(attr); dev_get_stats(dev, sp); @@ -5152,8 +5468,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS); - if (!attr) + if (!attr) { + err = -EMSGSIZE; goto nla_put_failure; + } err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); @@ -5175,8 +5493,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS_SLAVE); - if (!attr) + if (!attr) { + err = -EMSGSIZE; goto nla_put_failure; + } err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); @@ -5188,13 +5508,19 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, *idxattr)) { + u32 off_filter_mask; + + off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); - if (!attr) + if (!attr) { + err = -EMSGSIZE; goto nla_put_failure; + } - err = rtnl_get_offload_stats(skb, dev, prividx); + err = rtnl_offload_xstats_fill(skb, dev, prividx, + off_filter_mask, extack); if (err == -ENODATA) nla_nest_cancel(skb, attr); else @@ -5210,19 +5536,21 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, *idxattr = IFLA_STATS_AF_SPEC; attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC); - if (!attr) + if (!attr) { + err = -EMSGSIZE; goto nla_put_failure; + } rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->fill_stats_af) { struct nlattr *af; - int err; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) { rcu_read_unlock(); + err = -EMSGSIZE; goto nla_put_failure; } err = af_ops->fill_stats_af(skb, dev); @@ -5255,13 +5583,14 @@ nla_put_failure: else nlmsg_end(skb, nlh); - return -EMSGSIZE; + return err; } static size_t if_nlmsg_stats_size(const struct net_device *dev, - u32 filter_mask) + const struct rtnl_stats_dump_filters *filters) { size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); + unsigned int filter_mask = filters->mask[0]; if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); @@ -5297,8 +5626,12 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, } } - if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) - size += rtnl_get_offload_stats_size(dev); + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) { + u32 off_filter_mask; + + off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]; + size += rtnl_offload_xstats_get_size(dev, off_filter_mask); + } if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { struct rtnl_af_ops *af_ops; @@ -5322,6 +5655,79 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, return size; } +#define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1) + +static const struct nla_policy +rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = { + [IFLA_STATS_LINK_OFFLOAD_XSTATS] = + NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID), +}; + +static const struct nla_policy +rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = { + [IFLA_STATS_GET_FILTERS] = + NLA_POLICY_NESTED(rtnl_stats_get_policy_filters), +}; + +static const struct nla_policy +ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = { + [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1), +}; + +static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters, + struct rtnl_stats_dump_filters *filters, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_STATS_MAX + 1]; + int err; + int at; + + err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters, + rtnl_stats_get_policy_filters, extack); + if (err < 0) + return err; + + for (at = 1; at <= IFLA_STATS_MAX; at++) { + if (tb[at]) { + if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(at))) { + NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask"); + return -EINVAL; + } + filters->mask[at] = nla_get_u32(tb[at]); + } + } + + return 0; +} + +static int rtnl_stats_get_parse(const struct nlmsghdr *nlh, + u32 filter_mask, + struct rtnl_stats_dump_filters *filters, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; + int err; + int i; + + filters->mask[0] = filter_mask; + for (i = 1; i < ARRAY_SIZE(filters->mask); i++) + filters->mask[i] = -1U; + + err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb, + IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack); + if (err < 0) + return err; + + if (tb[IFLA_STATS_GET_FILTERS]) { + err = rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS], + filters, extack); + if (err) + return err; + } + + return 0; +} + static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, bool is_dump, struct netlink_ext_ack *extack) { @@ -5344,10 +5750,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); return -EINVAL; } - if (nlmsg_attrlen(nlh, sizeof(*ifsm))) { - NL_SET_ERR_MSG(extack, "Invalid attributes after stats header"); - return -EINVAL; - } if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) { NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask"); return -EINVAL; @@ -5359,12 +5761,12 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; int idxattr = 0, prividx = 0; struct if_stats_msg *ifsm; struct sk_buff *nskb; - u32 filter_mask; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), @@ -5381,17 +5783,22 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, if (!dev) return -ENODEV; - filter_mask = ifsm->filter_mask; - if (!filter_mask) + if (!ifsm->filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get"); return -EINVAL; + } + + err = rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack); + if (err) + return err; - nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL); + nskb = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL); if (!nskb) return -ENOBUFS; err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - 0, filter_mask, &idxattr, &prividx); + 0, &filters, &idxattr, &prividx, extack); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ WARN_ON(err == -EMSGSIZE); @@ -5407,12 +5814,12 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; int h, s_h, err, s_idx, s_idxattr, s_prividx; + struct rtnl_stats_dump_filters filters; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; struct hlist_head *head; struct net_device *dev; - u32 filter_mask = 0; int idx = 0; s_h = cb->args[0]; @@ -5427,12 +5834,16 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) return err; ifsm = nlmsg_data(cb->nlh); - filter_mask = ifsm->filter_mask; - if (!filter_mask) { + if (!ifsm->filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); return -EINVAL; } + err = rtnl_stats_get_parse(cb->nlh, ifsm->filter_mask, &filters, + extack); + if (err) + return err; + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; @@ -5442,8 +5853,9 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, - flags, filter_mask, - &s_idxattr, &s_prividx); + flags, &filters, + &s_idxattr, &s_prividx, + extack); /* If we ran out of room on the first message, * we're in trouble */ @@ -5467,6 +5879,107 @@ out: return skb->len; } +void rtnl_offload_xstats_notify(struct net_device *dev) +{ + struct rtnl_stats_dump_filters response_filters = {}; + struct net *net = dev_net(dev); + int idxattr = 0, prividx = 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + ASSERT_RTNL(); + + response_filters.mask[0] |= + IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); + response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); + + skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters), + GFP_KERNEL); + if (!skb) + goto errout; + + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0, + &response_filters, &idxattr, &prividx, NULL); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL); + return; + +errout: + rtnl_set_sk_err(net, RTNLGRP_STATS, err); +} +EXPORT_SYMBOL(rtnl_offload_xstats_notify); + +static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; + struct rtnl_stats_dump_filters response_filters = {}; + struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct net_device *dev = NULL; + struct if_stats_msg *ifsm; + bool notify = false; + int err; + + err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), + false, extack); + if (err) + return err; + + ifsm = nlmsg_data(nlh); + if (ifsm->family != AF_UNSPEC) { + NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC"); + return -EINVAL; + } + + if (ifsm->ifindex > 0) + dev = __dev_get_by_index(net, ifsm->ifindex); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + if (ifsm->filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set"); + return -EINVAL; + } + + err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX, + ifla_stats_set_policy, extack); + if (err < 0) + return err; + + if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) { + u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]); + + if (req) + err = netdev_offload_xstats_enable(dev, t_l3, extack); + else + err = netdev_offload_xstats_disable(dev, t_l3); + + if (!err) + notify = true; + else if (err != -EALREADY) + return err; + + response_filters.mask[0] |= + IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); + response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); + } + + if (notify) + rtnl_offload_xstats_notify(dev); + + return 0; +} + /* Process one rtnetlink message. */ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -5474,11 +5987,11 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct rtnl_link *link; + enum rtnl_kinds kind; struct module *owner; int err = -EOPNOTSUPP; rtnl_doit_func doit; unsigned int flags; - int kind; int family; int type; @@ -5493,13 +6006,13 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; - kind = type&3; + kind = rtnl_msgtype_kind(type); - if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN)) + if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; rcu_read_lock(); - if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { + if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) { struct sock *rtnl; rtnl_dumpit_func dumpit; u32 min_dump_alloc = 0; @@ -5555,6 +6068,13 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, } flags = link->flags; + if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) && + !(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) { + NL_SET_ERR_MSG(extack, "Bulk delete is not supported"); + module_put(owner); + goto err_unlock; + } + if (flags & RTNL_FLAG_DOIT_UNLOCKED) { doit = link->doit; rcu_read_unlock(); @@ -5683,7 +6203,8 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, + RTNL_FLAG_BULK_DEL_SUPPORTED); rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); @@ -5692,4 +6213,5 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, 0); + rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0); } diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index b5bc680d4755..b0ff6153be62 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -19,8 +19,10 @@ #include <linux/in6.h> #include <net/tcp.h> -static siphash_key_t net_secret __read_mostly; -static siphash_key_t ts_secret __read_mostly; +static siphash_aligned_key_t net_secret; +static siphash_aligned_key_t ts_secret; + +#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ) static __always_inline void net_secret_init(void) { @@ -62,7 +64,7 @@ u32 secure_tcpv6_ts_off(const struct net *net, .daddr = *(struct in6_addr *)daddr, }; - if (net->ipv4.sysctl_tcp_timestamps != 1) + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) return 0; ts_secret_init(); @@ -94,17 +96,19 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, } EXPORT_SYMBOL(secure_tcpv6_seq); -u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, +u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport) { const struct { struct in6_addr saddr; struct in6_addr daddr; + unsigned int timeseed; __be16 dport; } __aligned(SIPHASH_ALIGNMENT) combined = { .saddr = *(struct in6_addr *)saddr, .daddr = *(struct in6_addr *)daddr, - .dport = dport + .timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD, + .dport = dport, }; net_secret_init(); return siphash(&combined, offsetofend(typeof(combined), dport), @@ -116,7 +120,7 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #ifdef CONFIG_INET u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr) { - if (net->ipv4.sysctl_tcp_timestamps != 1) + if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) return 0; ts_secret_init(); @@ -142,11 +146,13 @@ u32 secure_tcp_seq(__be32 saddr, __be32 daddr, } EXPORT_SYMBOL_GPL(secure_tcp_seq); -u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) +u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { net_secret_init(); - return siphash_3u32((__force u32)saddr, (__force u32)daddr, - (__force u16)dport, &net_secret); + return siphash_4u32((__force u32)saddr, (__force u32)daddr, + (__force u16)dport, + jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD, + &net_secret); } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ba2f38246f07..88fa40571d0c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -80,7 +80,7 @@ #include <linux/user_namespace.h> #include <linux/indirect_call_wrapper.h> -#include "datagram.h" +#include "dev.h" #include "sock_destructor.h" struct kmem_cache *skbuff_head_cache __ro_after_init; @@ -91,6 +91,13 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); +#undef FN +#define FN(reason) [SKB_DROP_REASON_##reason] = #reason, +const char * const drop_reasons[] = { + DEFINE_DROP_REASON(FN, FN) +}; +EXPORT_SYMBOL(drop_reasons); + /** * skb_panic - private function for out-of-line support * @skb: buffer @@ -127,8 +134,66 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) +#if PAGE_SIZE == SZ_4K + +#define NAPI_HAS_SMALL_PAGE_FRAG 1 +#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) + +/* specialized page frag allocator using a single order 0 page + * and slicing it into 1K sized fragment. Constrained to systems + * with a very limited amount of 1K fragments fitting a single + * page - to avoid excessive truesize underestimation + */ + +struct page_frag_1k { + void *va; + u16 offset; + bool pfmemalloc; +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) +{ + struct page *page; + int offset; + + offset = nc->offset - SZ_1K; + if (likely(offset >= 0)) + goto use_frag; + + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + if (!page) + return NULL; + + nc->va = page_address(page); + nc->pfmemalloc = page_is_pfmemalloc(page); + offset = PAGE_SIZE - SZ_1K; + page_ref_add(page, offset / SZ_1K); + +use_frag: + nc->offset = offset; + return nc->va + offset; +} +#else + +/* the small page is actually unused in this build; add dummy helpers + * to please the compiler and avoid later preprocessor's conditionals + */ +#define NAPI_HAS_SMALL_PAGE_FRAG 0 +#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false + +struct page_frag_1k { +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) +{ + return NULL; +} + +#endif + struct napi_alloc_cache { struct page_frag_cache page; + struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -136,6 +201,23 @@ struct napi_alloc_cache { static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -172,13 +254,14 @@ static struct sk_buff *napi_skb_cache_get(void) struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; - if (unlikely(!nc->skb_count)) + if (unlikely(!nc->skb_count)) { nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache, GFP_ATOMIC, NAPI_SKB_CACHE_BULK, nc->skb_cache); - if (unlikely(!nc->skb_count)) - return NULL; + if (unlikely(!nc->skb_count)) + return NULL; + } skb = nc->skb_cache[--nc->skb_count]; kasan_unpoison_object_data(skbuff_head_cache, skb); @@ -201,10 +284,10 @@ static void __build_skb_around(struct sk_buff *skb, void *data, skb->head = data; skb->data = data; skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; + skb_set_end_offset(skb, size); skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; - + skb->alloc_cpu = raw_smp_processor_id(); /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); @@ -450,8 +533,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->fclone = SKB_FCLONE_ORIG; refcount_set(&fclones->fclone_ref, 1); - - fclones->skb2.fclone = SKB_FCLONE_CLONE; } return skb; @@ -555,14 +636,18 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, { struct napi_alloc_cache *nc; struct sk_buff *skb; + bool pfmemalloc; void *data; + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); len += NET_SKB_PAD + NET_IP_ALIGN; /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. + * When the small frag allocator is available, prefer it over kmalloc + * for small fragments */ - if (len <= SKB_WITH_OVERHEAD(1024) || + if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -573,13 +658,33 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, } nc = this_cpu_ptr(&napi_alloc_cache); - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - len = SKB_DATA_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = page_frag_alloc(&nc->page, len, gfp_mask); + if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { + /* we are artificially inflating the allocation size, but + * that is not as bad as it may look like, as: + * - 'len' less than GRO_MAX_HEAD makes little sense + * - On most systems, larger 'len' values lead to fragment + * size above 512 bytes + * - kmalloc would use the kmalloc-1k slab for such values + * - Builds with smaller GRO_MAX_HEAD will very likely do + * little networking, as that implies no WiFi and no + * tunnels support, and 32 bits arches. + */ + len = SZ_1K; + + data = page_frag_alloc_1k(&nc->page_small, gfp_mask); + pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); + } else { + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); + + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = nc->page.pfmemalloc; + } + if (unlikely(!data)) return NULL; @@ -589,7 +694,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, return NULL; } - if (nc->page.pfmemalloc) + if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; @@ -666,11 +771,18 @@ static void skb_release_data(struct sk_buff *skb) &shinfo->dataref)) goto exit; - skb_zcopy_clear(skb, true); + if (skb_zcopy(skb)) { + bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS; + + skb_zcopy_clear(skb, true); + if (skip_unref) + goto free_head; + } for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle); +free_head: if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); @@ -681,7 +793,7 @@ exit: * while trying to recycle fragments on __skb_frag_unref() we need * to make one SKB responsible for triggering the recycle path. * So disable the recycling bit if an SKB is cloned and we have - * additional references to to the fragmented part of the SKB. + * additional references to the fragmented part of the SKB. * Eventually the last SKB will have the recycling bit set and it's * dataref set to 0, which will trigger the recycling */ @@ -725,7 +837,7 @@ void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); if (skb->destructor) { - WARN_ON(in_hardirq()); + DEBUG_NET_WARN_ON_ONCE(in_hardirq()); skb->destructor(skb); } #if IS_ENABLED(CONFIG_NF_CONNTRACK) @@ -759,32 +871,38 @@ void __kfree_skb(struct sk_buff *skb) EXPORT_SYMBOL(__kfree_skb); /** - * kfree_skb - free an sk_buff + * kfree_skb_reason - free an sk_buff with special reason * @skb: buffer to free + * @reason: reason why this skb is dropped * * Drop a reference to the buffer and free it if the usage count has - * hit zero. + * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' + * tracepoint. */ -void kfree_skb(struct sk_buff *skb) +void __fix_address +kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) { - if (!skb_unref(skb)) + if (unlikely(!skb_unref(skb))) return; - trace_kfree_skb(skb, __builtin_return_address(0)); + DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); + + trace_kfree_skb(skb, __builtin_return_address(0), reason); __kfree_skb(skb); } -EXPORT_SYMBOL(kfree_skb); +EXPORT_SYMBOL(kfree_skb_reason); -void kfree_skb_list(struct sk_buff *segs) +void kfree_skb_list_reason(struct sk_buff *segs, + enum skb_drop_reason reason) { while (segs) { struct sk_buff *next = segs->next; - kfree_skb(segs); + kfree_skb_reason(segs, reason); segs = next; } } -EXPORT_SYMBOL(kfree_skb_list); +EXPORT_SYMBOL(kfree_skb_list_reason); /* Dump skb information and contents. * @@ -832,7 +950,7 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); if (dev) - printk("%sdev name=%s feat=0x%pNF\n", + printk("%sdev name=%s feat=%pNF\n", level, dev->name, &dev->features); if (sk) printk("%ssk family=%hu type=%u proto=%u\n", @@ -890,7 +1008,10 @@ EXPORT_SYMBOL(skb_dump); */ void skb_tx_error(struct sk_buff *skb) { - skb_zcopy_clear(skb, true); + if (skb) { + skb_zcopy_downgrade_managed(skb); + skb_zcopy_clear(skb, true); + } } EXPORT_SYMBOL(skb_tx_error); @@ -973,7 +1094,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - lockdep_assert_in_softirq(); + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); if (!skb_unref(skb)) return; @@ -992,12 +1113,10 @@ void napi_consume_skb(struct sk_buff *skb, int budget) } EXPORT_SYMBOL(napi_consume_skb); -/* Make sure a field is enclosed inside headers_start/headers_end section */ +/* Make sure a field is contained by headers group */ #define CHECK_SKB_FIELD(field) \ - BUILD_BUG_ON(offsetof(struct sk_buff, field) < \ - offsetof(struct sk_buff, headers_start)); \ - BUILD_BUG_ON(offsetof(struct sk_buff, field) > \ - offsetof(struct sk_buff, headers_end)); \ + BUILD_BUG_ON(offsetof(struct sk_buff, field) != \ + offsetof(struct sk_buff, headers.field)); \ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { @@ -1009,14 +1128,12 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) __skb_ext_copy(new, old); __nf_copy(new, old, false); - /* Note : this field could be in headers_start/headers_end section + /* Note : this field could be in the headers group. * It is not yet because we do not want to have a 16 bit hole */ new->queue_mapping = old->queue_mapping; - memcpy(&new->headers_start, &old->headers_start, - offsetof(struct sk_buff, headers_end) - - offsetof(struct sk_buff, headers_start)); + memcpy(&new->headers, &old->headers, sizeof(new->headers)); CHECK_SKB_FIELD(protocol); CHECK_SKB_FIELD(csum); CHECK_SKB_FIELD(hash); @@ -1038,6 +1155,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #ifdef CONFIG_NET_RX_BUSY_POLL CHECK_SKB_FIELD(napi_id); #endif + CHECK_SKB_FIELD(alloc_cpu); #ifdef CONFIG_XPS CHECK_SKB_FIELD(sender_cpu); #endif @@ -1166,9 +1284,9 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) { - struct ubuf_info *uarg; + struct ubuf_info_msgzc *uarg; struct sk_buff *skb; WARN_ON_ONCE(!in_task()); @@ -1186,20 +1304,19 @@ struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) return NULL; } - uarg->callback = msg_zerocopy_callback; + uarg->ubuf.callback = msg_zerocopy_callback; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; uarg->zerocopy = 1; - uarg->flags = SKBFL_ZEROCOPY_FRAG; - refcount_set(&uarg->refcnt, 1); + uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + refcount_set(&uarg->ubuf.refcnt, 1); sock_hold(sk); - return uarg; + return &uarg->ubuf; } -EXPORT_SYMBOL_GPL(msg_zerocopy_alloc); -static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) +static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) { return container_of((void *)uarg, struct sk_buff, cb); } @@ -1208,9 +1325,14 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg) { if (uarg) { + struct ubuf_info_msgzc *uarg_zc; const u32 byte_limit = 1 << 19; /* limit to a few TSO */ u32 bytelen, next; + /* there might be non MSG_ZEROCOPY users */ + if (uarg->callback != msg_zerocopy_callback) + return NULL; + /* realloc only when socket is locked (TCP, UDP cork), * so uarg->len and sk_zckey access is serialized */ @@ -1219,8 +1341,9 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, return NULL; } - bytelen = uarg->bytelen + size; - if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) { + uarg_zc = uarg_to_msgzc(uarg); + bytelen = uarg_zc->bytelen + size; + if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) { /* TCP can create new skb to attach new uarg */ if (sk->sk_type == SOCK_STREAM) goto new_alloc; @@ -1228,11 +1351,11 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, } next = (u32)atomic_read(&sk->sk_zckey); - if ((u32)(uarg->id + uarg->len) == next) { - if (mm_account_pinned_pages(&uarg->mmp, size)) + if ((u32)(uarg_zc->id + uarg_zc->len) == next) { + if (mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; - uarg->len++; - uarg->bytelen = bytelen; + uarg_zc->len++; + uarg_zc->bytelen = bytelen; atomic_set(&sk->sk_zckey, ++next); /* no extra ref when appending to datagram (MSG_MORE) */ @@ -1268,7 +1391,7 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) return true; } -static void __msg_zerocopy_callback(struct ubuf_info *uarg) +static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg) { struct sk_buff *tail, *skb = skb_from_uarg(uarg); struct sock_exterr_skb *serr; @@ -1321,37 +1444,32 @@ release: void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, bool success) { - uarg->zerocopy = uarg->zerocopy & success; + struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); + + uarg_zc->zerocopy = uarg_zc->zerocopy & success; if (refcount_dec_and_test(&uarg->refcnt)) - __msg_zerocopy_callback(uarg); + __msg_zerocopy_callback(uarg_zc); } EXPORT_SYMBOL_GPL(msg_zerocopy_callback); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { - struct sock *sk = skb_from_uarg(uarg)->sk; + struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk; atomic_dec(&sk->sk_zckey); - uarg->len--; + uarg_to_msgzc(uarg)->len--; if (have_uref) msg_zerocopy_callback(NULL, uarg, true); } EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); -int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) -{ - return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); -} -EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram); - int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) { struct ubuf_info *orig_uarg = skb_zcopy(skb); - struct iov_iter orig_iter = msg->msg_iter; int err, orig_len = skb->len; /* An skb can only point to one uarg. This edge case happens when @@ -1360,12 +1478,12 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, if (orig_uarg && uarg != orig_uarg) return -EEXIST; - err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len); + err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; /* Streams do not free skb on error. Reset to prev state. */ - msg->msg_iter = orig_iter; + iov_iter_revert(&msg->msg_iter, skb->len - orig_len); skb->sk = sk; ___pskb_trim(skb, orig_len); skb->sk = save_sk; @@ -1377,6 +1495,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); +void __skb_zcopy_downgrade_managed(struct sk_buff *skb) +{ + int i; + + skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_frag_ref(skb, i); +} +EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); + static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, gfp_t gfp_mask) { @@ -1514,6 +1642,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) refcount_read(&fclones->fclone_ref) == 1) { n = &fclones->skb2; refcount_set(&fclones->fclone_ref, 2); + n->fclone = SKB_FCLONE_CLONE; } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -1694,6 +1823,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, BUG_ON(skb_shared(skb)); + skb_zcopy_downgrade_managed(skb); + size = SKB_DATA_ALIGN(size); if (skb_pfmemalloc(skb)) @@ -1738,11 +1869,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->head = data; skb->head_frag = 0; skb->data += off; + + skb_set_end_offset(skb, size); #ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; off = nhead; -#else - skb->end = skb->head + size; #endif skb->tail += off; skb_headers_offset_update(skb, nhead); @@ -1790,6 +1920,38 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) } EXPORT_SYMBOL(skb_realloc_headroom); +int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) +{ + unsigned int saved_end_offset, saved_truesize; + struct skb_shared_info *shinfo; + int res; + + saved_end_offset = skb_end_offset(skb); + saved_truesize = skb->truesize; + + res = pskb_expand_head(skb, 0, 0, pri); + if (res) + return res; + + skb->truesize = saved_truesize; + + if (likely(skb_end_offset(skb) == saved_end_offset)) + return 0; + + shinfo = skb_shinfo(skb); + + /* We are about to change back skb->end, + * we need to move skb_shinfo() to its new location. + */ + memmove(skb->head + saved_end_offset, + shinfo, + offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); + + skb_set_end_offset(skb, saved_end_offset); + + return 0; +} + /** * skb_expand_head - reallocate header of &sk_buff * @skb: buffer to reallocate @@ -2028,6 +2190,30 @@ void *skb_pull(struct sk_buff *skb, unsigned int len) EXPORT_SYMBOL(skb_pull); /** + * skb_pull_data - remove data from the start of a buffer returning its + * original position. + * @skb: buffer to use + * @len: amount of data to remove + * + * This function removes data from the start of a buffer, returning + * the memory to the headroom. A pointer to the original data in the buffer + * is returned after checking if there is enough data to pull. Once the + * data has been pulled future pushes will overwrite the old data. + */ +void *skb_pull_data(struct sk_buff *skb, size_t len) +{ + void *data = skb->data; + + if (skb->len < len) + return NULL; + + skb_pull(skb, len); + + return data; +} +EXPORT_SYMBOL(skb_pull_data); + +/** * skb_trim - remove end from a buffer * @skb: buffer to alter * @len: new length @@ -2254,7 +2440,7 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) /* Free pulled out fragments. */ while ((list = skb_shinfo(skb)->frag_list) != insp) { skb_shinfo(skb)->frag_list = list->next; - kfree_skb(list); + consume_skb(list); } /* And insert new clone at head. */ if (clone) { @@ -3141,9 +3327,7 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) } } - to->truesize += len + plen; - to->len += len + plen; - to->data_len += len + plen; + skb_len_add(to, len + plen); if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { skb_tx_error(from); @@ -3435,6 +3619,8 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) int pos = skb_headlen(skb); const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; + skb_zcopy_downgrade_managed(skb); + skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ @@ -3580,13 +3766,8 @@ onlymerged: tgt->ip_summed = CHECKSUM_PARTIAL; skb->ip_summed = CHECKSUM_PARTIAL; - /* Yak, is it really working this way? Some helper please? */ - skb->len -= shiftlen; - skb->data_len -= shiftlen; - skb->truesize -= shiftlen; - tgt->len += shiftlen; - tgt->data_len += shiftlen; - tgt->truesize += shiftlen; + skb_len_add(skb, -shiftlen); + skb_len_add(tgt, shiftlen); return shiftlen; } @@ -3788,8 +3969,9 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page, if (skb_can_coalesce(skb, i, page, offset)) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); } else if (i < MAX_SKB_FRAGS) { + skb_zcopy_downgrade_managed(skb); get_page(page); - skb_fill_page_desc(skb, i, page, offset, size); + skb_fill_page_desc_noacc(skb, i, page, offset, size); } else { return -EMSGSIZE; } @@ -3843,7 +4025,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, unsigned int delta_len = 0; struct sk_buff *tail = NULL; struct sk_buff *nskb, *tmp; - int err; + int len_diff, err; skb_push(skb, -skb_network_offset(skb) + offset); @@ -3854,6 +4036,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, list_skb = list_skb->next; err = 0; + delta_truesize += nskb->truesize; if (skb_shared(nskb)) { tmp = skb_clone(nskb, GFP_ATOMIC); if (tmp) { @@ -3878,14 +4061,15 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, tail = nskb; delta_len += nskb->len; - delta_truesize += nskb->truesize; skb_push(nskb, -skb_network_offset(nskb) + offset); skb_release_head_state(nskb); + len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); __copy_skb_header(nskb, skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); + nskb->transport_header += len_diff; skb_copy_from_linear_data_offset(skb, -tnl_hlen, nskb->data - tnl_hlen, offset + tnl_hlen); @@ -3919,32 +4103,6 @@ err_linearize: } EXPORT_SYMBOL_GPL(skb_segment_list); -int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) -{ - if (unlikely(p->len + skb->len >= 65536)) - return -E2BIG; - - if (NAPI_GRO_CB(p)->last == p) - skb_shinfo(p)->frag_list = skb; - else - NAPI_GRO_CB(p)->last->next = skb; - - skb_pull(skb, skb_gro_offset(skb)); - - NAPI_GRO_CB(p)->last = skb; - NAPI_GRO_CB(p)->count++; - p->data_len += skb->len; - - /* sk owenrship - if any - completely transferred to the aggregated packet */ - skb->destructor = NULL; - p->truesize += skb->truesize; - p->len += skb->len; - - NAPI_GRO_CB(skb)->same_flow = 1; - - return 0; -} - /** * skb_segment - Perform protocol segmentation on skb. * @head_skb: buffer to segment @@ -3976,23 +4134,25 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int i = 0; int pos; - if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) && - (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) { - /* gso_size is untrusted, and we have a frag_list with a linear - * non head_frag head. - * - * (we assume checking the first list_skb member suffices; - * i.e if either of the list_skb members have non head_frag - * head, then the first one has too). - * - * If head_skb's headlen does not fit requested gso_size, it - * means that the frag_list members do NOT terminate on exact - * gso_size boundaries. Hence we cannot perform skb_frag_t page - * sharing. Therefore we must fallback to copying the frag_list - * skbs; we do so by disabling SG. - */ - if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) - features &= ~NETIF_F_SG; + if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && + mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { + struct sk_buff *check_skb; + + for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { + if (skb_headlen(check_skb) && !check_skb->head_frag) { + /* gso_size is untrusted, and we have a frag_list with + * a linear non head_frag item. + * + * If head_skb's headlen does not fit requested gso_size, + * it means that the frag_list members do NOT terminate + * on exact gso_size boundaries. Hence we cannot perform + * skb_frag_t page sharing. Therefore we must fallback to + * copying the frag_list skbs; we do so by disabling SG. + */ + features &= ~NETIF_F_SG; + break; + } + } } __skb_push(head_skb, doffset); @@ -4154,9 +4314,8 @@ normal: SKB_GSO_CB(nskb)->csum_start = skb_headroom(nskb) + doffset; } else { - skb_copy_bits(head_skb, offset, - skb_put(nskb, len), - len); + if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) + goto err; } continue; } @@ -4297,122 +4456,6 @@ err: } EXPORT_SYMBOL_GPL(skb_segment); -int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) -{ - struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); - unsigned int offset = skb_gro_offset(skb); - unsigned int headlen = skb_headlen(skb); - unsigned int len = skb_gro_len(skb); - unsigned int delta_truesize; - unsigned int new_truesize; - struct sk_buff *lp; - - if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush)) - return -E2BIG; - - lp = NAPI_GRO_CB(p)->last; - pinfo = skb_shinfo(lp); - - if (headlen <= offset) { - skb_frag_t *frag; - skb_frag_t *frag2; - int i = skbinfo->nr_frags; - int nr_frags = pinfo->nr_frags + i; - - if (nr_frags > MAX_SKB_FRAGS) - goto merge; - - offset -= headlen; - pinfo->nr_frags = nr_frags; - skbinfo->nr_frags = 0; - - frag = pinfo->frags + nr_frags; - frag2 = skbinfo->frags + i; - do { - *--frag = *--frag2; - } while (--i); - - skb_frag_off_add(frag, offset); - skb_frag_size_sub(frag, offset); - - /* all fragments truesize : remove (head size + sk_buff) */ - new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); - delta_truesize = skb->truesize - new_truesize; - - skb->truesize = new_truesize; - skb->len -= skb->data_len; - skb->data_len = 0; - - NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; - goto done; - } else if (skb->head_frag) { - int nr_frags = pinfo->nr_frags; - skb_frag_t *frag = pinfo->frags + nr_frags; - struct page *page = virt_to_head_page(skb->head); - unsigned int first_size = headlen - offset; - unsigned int first_offset; - - if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) - goto merge; - - first_offset = skb->data - - (unsigned char *)page_address(page) + - offset; - - pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; - - __skb_frag_set_page(frag, page); - skb_frag_off_set(frag, first_offset); - skb_frag_size_set(frag, first_size); - - memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); - /* We dont need to clear skbinfo->nr_frags here */ - - new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); - delta_truesize = skb->truesize - new_truesize; - skb->truesize = new_truesize; - NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; - goto done; - } - -merge: - /* sk owenrship - if any - completely transferred to the aggregated packet */ - skb->destructor = NULL; - delta_truesize = skb->truesize; - if (offset > headlen) { - unsigned int eat = offset - headlen; - - skb_frag_off_add(&skbinfo->frags[0], eat); - skb_frag_size_sub(&skbinfo->frags[0], eat); - skb->data_len -= eat; - skb->len -= eat; - offset = headlen; - } - - __skb_pull(skb, offset); - - if (NAPI_GRO_CB(p)->last == p) - skb_shinfo(p)->frag_list = skb; - else - NAPI_GRO_CB(p)->last->next = skb; - NAPI_GRO_CB(p)->last = skb; - __skb_header_release(skb); - lp = p; - -done: - NAPI_GRO_CB(p)->count++; - p->data_len += len; - p->truesize += delta_truesize; - p->len += len; - if (lp != p) { - lp->data_len += len; - lp->truesize += delta_truesize; - lp->len += len; - } - NAPI_GRO_CB(skb)->same_flow = 1; - return 0; -} - #ifdef CONFIG_SKB_EXTENSIONS #define SKB_EXT_ALIGN_VALUE 8 #define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) @@ -4849,9 +4892,8 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) - serr->ee.ee_data -= sk->sk_tskey; + if (sk_is_tcp(sk)) + serr->ee.ee_data -= atomic_read(&sk->sk_tskey); } err = sock_queue_err_skb(sk, skb); @@ -4864,7 +4906,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { bool ret; - if (likely(sysctl_tstamp_allow_data || tsonly)) + if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) return true; read_lock_bh(&sk->sk_callback_lock); @@ -4919,8 +4961,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (tsonly) { #ifdef CONFIG_INET if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && - sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) { + sk_is_tcp(sk)) { skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ack_skb); opt_stats = true; @@ -4942,7 +4983,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (hwtstamps) *skb_hwtstamps(skb) = *hwtstamps; else - skb->tstamp = ktime_get_real(); + __net_timestamp(skb); __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); } @@ -5366,11 +5407,18 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_cloned(to)) return false; - /* The page pool signature of struct page will eventually figure out - * which pages can be recycled or not but for now let's prohibit slab - * allocated and page_pool allocated SKBs from being coalesced. + /* In general, avoid mixing slab allocated and page_pool allocated + * pages within the same SKB. However when @to is not pp_recycle and + * @from is cloned, we can transition frag pages from page_pool to + * reference counted. + * + * On the other hand, don't allow coalescing two pp_recycle SKBs if + * @from is cloned, in case the SKB is using page_pool fragment + * references (PP_FLAG_PAGE_FRAG). Since we only take full page + * references for cloned SKBs at the moment that would result in + * inconsistent reference counts. */ - if (to->pp_recycle != from->pp_recycle) + if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from))) return false; if (len <= skb_tailroom(to)) { @@ -5472,7 +5520,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) ipvs_reset(skb); skb->mark = 0; - skb->tstamp = 0; + skb_clear_tstamp(skb); } EXPORT_SYMBOL_GPL(skb_scrub_packet); @@ -5684,7 +5732,7 @@ err_free: } EXPORT_SYMBOL(skb_vlan_untag); -int skb_ensure_writable(struct sk_buff *skb, int write_len) +int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) { if (!pskb_may_pull(skb, write_len)) return -ENOMEM; @@ -6166,11 +6214,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb->head = data; skb->data = data; skb->head_frag = 0; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; -#else - skb->end = skb->head + size; -#endif + skb_set_end_offset(skb, size); skb_set_tail_pointer(skb, skb_headlen(skb)); skb_headers_offset_update(skb, 0); skb->cloned = 0; @@ -6227,7 +6271,7 @@ static int pskb_carve_frag_list(struct sk_buff *skb, /* Free pulled out fragments. */ while ((list = shinfo->frag_list) != insp) { shinfo->frag_list = list->next; - kfree_skb(list); + consume_skb(list); } /* And insert new clone at head. */ if (clone) { @@ -6308,11 +6352,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb->head = data; skb->head_frag = 0; skb->data = data; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; -#else - skb->end = skb->head + size; -#endif + skb_set_end_offset(skb, size); skb_reset_tail_pointer(skb); skb_headers_offset_update(skb, 0); skb->cloned = 0; @@ -6577,3 +6617,49 @@ free_now: } EXPORT_SYMBOL(__skb_ext_put); #endif /* CONFIG_SKB_EXTENSIONS */ + +/** + * skb_attempt_defer_free - queue skb for remote freeing + * @skb: buffer + * + * Put @skb in a per-cpu list, using the cpu which + * allocated the skb/pages to reduce false sharing + * and memory zone spinlock contention. + */ +void skb_attempt_defer_free(struct sk_buff *skb) +{ + int cpu = skb->alloc_cpu; + struct softnet_data *sd; + unsigned long flags; + unsigned int defer_max; + bool kick; + + if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || + !cpu_online(cpu) || + cpu == raw_smp_processor_id()) { +nodefer: __kfree_skb(skb); + return; + } + + sd = &per_cpu(softnet_data, cpu); + defer_max = READ_ONCE(sysctl_skb_defer_max); + if (READ_ONCE(sd->defer_count) >= defer_max) + goto nodefer; + + spin_lock_irqsave(&sd->defer_lock, flags); + /* Send an IPI every time queue reaches half capacity. */ + kick = sd->defer_count == (defer_max >> 1); + /* Paired with the READ_ONCE() few lines above */ + WRITE_ONCE(sd->defer_count, sd->defer_count + 1); + + skb->next = sd->defer_list; + /* Paired with READ_ONCE() in skb_defer_free_flush() */ + WRITE_ONCE(sd->defer_list, skb); + spin_unlock_irqrestore(&sd->defer_lock, flags); + + /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU + * if we are unlucky enough (this seems very unlikely). + */ + if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) + smp_call_function_single_async(cpu, &sd->defer_csd); +} diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8eb671c827f9..e6b9ced3eda8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -27,6 +27,7 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce) { struct page_frag *pfrag = sk_page_frag(sk); + u32 osize = msg->sg.size; int ret = 0; len -= msg->sg.size; @@ -35,13 +36,17 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, u32 orig_offset; int use, i; - if (!sk_page_frag_refill(sk, pfrag)) - return -ENOMEM; + if (!sk_page_frag_refill(sk, pfrag)) { + ret = -ENOMEM; + goto msg_trim; + } orig_offset = pfrag->offset; use = min_t(int, len, pfrag->size - orig_offset); - if (!sk_wmem_schedule(sk, use)) - return -ENOMEM; + if (!sk_wmem_schedule(sk, use)) { + ret = -ENOMEM; + goto msg_trim; + } i = msg->sg.end; sk_msg_iter_var_prev(i); @@ -71,6 +76,10 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, } return ret; + +msg_trim: + sk_msg_trim(sk, msg, osize); + return ret; } EXPORT_SYMBOL_GPL(sk_msg_alloc); @@ -315,14 +324,13 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, goto out; } - copied = iov_iter_get_pages(from, pages, bytes, maxpages, + copied = iov_iter_get_pages2(from, pages, bytes, maxpages, &offset); if (copied <= 0) { ret = -EFAULT; goto out; } - iov_iter_advance(from, copied); bytes -= copied; msg->sg.size += copied; @@ -426,8 +434,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (copied + copy > len) copy = len - copied; copy = copy_page_to_iter(page, sge->offset, copy, iter); - if (!copy) - return copied ? copied : -EFAULT; + if (!copy) { + copied = copied ? copied : -EFAULT; + goto out; + } copied += copy; if (likely(!peek)) { @@ -447,13 +457,13 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, * didn't copy the entire length lets just break. */ if (copy != sge->length) - return copied; + goto out; sk_msg_iter_var_next(i); } if (copied == len) break; - } while (i != msg_rx->sg.end); + } while ((i != msg_rx->sg.end) && !sg_is_last(sge)); if (unlikely(peek)) { msg_rx = sk_psock_next_msg(psock, msg_rx); @@ -463,13 +473,15 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } msg_rx->sg.start = i; - if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { + if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) { msg_rx = sk_psock_dequeue_msg(psock); kfree_sk_msg(msg_rx); } msg_rx = sk_psock_peek_msg(psock); } - +out: + if (psock->work_state.skb && copied > 0) + schedule_work(&psock->work); return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); @@ -488,23 +500,27 @@ bool sk_msg_is_readable(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_msg_is_readable); -static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, - struct sk_buff *skb) +static struct sk_msg *alloc_sk_msg(gfp_t gfp) { struct sk_msg *msg; - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) + msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN); + if (unlikely(!msg)) return NULL; + sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); + return msg; +} - if (!sk_rmem_schedule(sk, skb, skb->truesize)) +static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, + struct sk_buff *skb) +{ + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) return NULL; - msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL); - if (unlikely(!msg)) + if (!sk_rmem_schedule(sk, skb, skb->truesize)) return NULL; - sk_msg_init(msg); - return msg; + return alloc_sk_msg(GFP_KERNEL); } static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, @@ -515,16 +531,20 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, { int num_sge, copied; - /* skb linearize may fail with ENOMEM, but lets simply try again - * later if this happens. Under memory pressure we don't want to - * drop the skb. We need to linearize the skb so that the mapping - * in skb_to_sgvec can not error. - */ - if (skb_linearize(skb)) - return -EAGAIN; num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); - if (unlikely(num_sge < 0)) - return num_sge; + if (num_sge < 0) { + /* skb linearize may fail with ENOMEM, but lets simply try again + * later if this happens. Under memory pressure we don't want to + * drop the skb. We need to linearize the skb so that the mapping + * in skb_to_sgvec can not error. + */ + if (skb_linearize(skb)) + return -EAGAIN; + + num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); + if (unlikely(num_sge < 0)) + return num_sge; + } copied = len; msg->sg.start = 0; @@ -577,13 +597,12 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) { - struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); + struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sock *sk = psock->sk; int err; if (unlikely(!msg)) return -EAGAIN; - sk_msg_init(msg); skb_set_owner_r(skb, sk); err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); if (err < 0) @@ -686,6 +705,11 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) write_lock_bh(&sk->sk_callback_lock); + if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) { + psock = ERR_PTR(-EINVAL); + goto out; + } + if (sk->sk_user_data) { psock = ERR_PTR(-EBUSY); goto out; @@ -702,6 +726,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) psock->eval = __SK_NONE; psock->sk_proto = prot; psock->saved_unhash = prot->unhash; + psock->saved_destroy = prot->destroy; psock->saved_close = prot->close; psock->saved_write_space = sk->sk_write_space; @@ -717,7 +742,9 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); refcount_set(&psock->refcnt, 1); - rcu_assign_sk_user_data_nocopy(sk, psock); + __rcu_assign_sk_user_data_with_flags(sk, psock, + SK_USER_DATA_NOCOPY | + SK_USER_DATA_PSOCK); sock_hold(sk); out: @@ -776,16 +803,13 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } -void sk_psock_stop(struct sk_psock *psock, bool wait) +void sk_psock_stop(struct sk_psock *psock) { spin_lock_bh(&psock->ingress_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); sk_psock_cork_free(psock); __sk_psock_zap_ingress(psock); spin_unlock_bh(&psock->ingress_lock); - - if (wait) - cancel_work_sync(&psock->work); } static void sk_psock_done_strp(struct sk_psock *psock); @@ -823,7 +847,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - sk_psock_stop(psock, false); + sk_psock_stop(psock); INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); queue_rcu_work(system_wq, &psock->rwork); @@ -1146,21 +1170,14 @@ static void sk_psock_done_strp(struct sk_psock *psock) } #endif /* CONFIG_BPF_STREAM_PARSER */ -static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, - unsigned int offset, size_t orig_len) +static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = (struct sock *)desc->arg.data; struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; int len = skb->len; - /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ - skb = skb_clone(skb, GFP_ATOMIC); - if (!skb) { - desc->error = -ENOMEM; - return 0; - } + skb_get(skb); rcu_read_lock(); psock = sk_psock(sk); @@ -1173,15 +1190,14 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, if (!prog) prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - skb->sk = sk; skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); - skb->sk = NULL; } - if (sk_psock_verdict_apply(psock, skb, ret) < 0) - len = 0; + ret = sk_psock_verdict_apply(psock, skb, ret); + if (ret < 0) + len = ret; out: rcu_read_unlock(); return len; @@ -1190,16 +1206,10 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; - read_descriptor_t desc; - if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) + if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) return; - - desc.arg.data = sk; - desc.error = 0; - desc.count = 1; - - sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); + sock->ops->read_skb(sk, sk_psock_verdict_recv); } void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) diff --git a/net/core/sock.c b/net/core/sock.c index 41e91d0f7061..a3ba0358c77c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -141,10 +141,13 @@ #include <linux/ethtool.h> +#include "dev.h" + static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); -static void sock_inuse_add(struct net *net, int val); +static void sock_def_write_space_wfree(struct sock *sk); +static void sock_def_write_space(struct sock *sk); /** * sk_ns_capable - General socket capability test @@ -327,7 +330,10 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); noreclaim_flag = memalloc_noreclaim_save(); - ret = sk->sk_backlog_rcv(sk, skb); + ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, + tcp_v6_do_rcv, + tcp_v4_do_rcv, + sk, skb); memalloc_noreclaim_restore(noreclaim_flag); return ret; @@ -502,17 +508,35 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(__sock_queue_rcv_skb); -int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason *reason) { + enum skb_drop_reason drop_reason; int err; err = sk_filter(sk, skb); - if (err) - return err; - - return __sock_queue_rcv_skb(sk, skb); + if (err) { + drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + goto out; + } + err = __sock_queue_rcv_skb(sk, skb); + switch (err) { + case -ENOMEM: + drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; + break; + case -ENOBUFS: + drop_reason = SKB_DROP_REASON_PROTO_MEM; + break; + default: + drop_reason = SKB_NOT_DROPPED_YET; + break; + } +out: + if (reason) + *reason = drop_reason; + return err; } -EXPORT_SYMBOL(sock_queue_rcv_skb); +EXPORT_SYMBOL(sock_queue_rcv_skb_reason); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted) @@ -611,7 +635,9 @@ static int sock_bindtoindex_locked(struct sock *sk, int ifindex) if (ifindex < 0) goto out; - sk->sk_bound_dev_if = ifindex; + /* Paired with all READ_ONCE() done locklessly. */ + WRITE_ONCE(sk->sk_bound_dev_if, ifindex); + if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); sk_dst_reset(sk); @@ -677,22 +703,25 @@ static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) goto out; } - return sock_bindtoindex(sk, index, true); + sockopt_lock_sock(sk); + ret = sock_bindtoindex_locked(sk, index); + sockopt_release_sock(sk); out: #endif return ret; } -static int sock_getbindtodevice(struct sock *sk, char __user *optval, - int __user *optlen, int len) +static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES + int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); struct net *net = sock_net(sk); char devname[IFNAMSIZ]; - if (sk->sk_bound_dev_if == 0) { + if (bound_dev_if == 0) { len = 0; goto zero; } @@ -701,19 +730,19 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, if (len < IFNAMSIZ) goto out; - ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); + ret = netdev_get_name(net, devname, bound_dev_if); if (ret) goto out; len = strlen(devname) + 1; ret = -EFAULT; - if (copy_to_user(optval, devname, len)) + if (copy_to_sockptr(optval, devname, len)) goto out; zero: ret = -EFAULT; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) goto out; ret = 0; @@ -843,6 +872,8 @@ static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) } num = ethtool_get_phc_vclocks(dev, &vclock_index); + dev_put(dev); + for (i = 0; i < num; i++) { if (*(vclock_index + i) == phc_index) { match = true; @@ -872,14 +903,13 @@ int sock_set_timestamping(struct sock *sk, int optname, if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) { + if (sk_is_tcp(sk)) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; - sk->sk_tskey = tcp_sk(sk)->snd_una; + atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { - sk->sk_tskey = 0; + atomic_set(&sk->sk_tskey, 0); } } @@ -963,7 +993,7 @@ EXPORT_SYMBOL(sock_set_mark); static void sock_release_reserved_memory(struct sock *sk, int bytes) { /* Round down bytes to multiple of pages */ - bytes &= ~(SK_MEM_QUANTUM - 1); + bytes = round_down(bytes, PAGE_SIZE); WARN_ON(bytes > sk->sk_reserved_mem); sk->sk_reserved_mem -= bytes; @@ -991,7 +1021,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes) return -ENOMEM; /* pre-charge to forward_alloc */ - allocated = sk_memory_allocated_add(sk, pages); + sk_memory_allocated_add(sk, pages); + allocated = sk_memory_allocated(sk); /* If the system goes into memory pressure with this * precharge, give up and return error. */ @@ -1000,24 +1031,58 @@ static int sock_reserve_memory(struct sock *sk, int bytes) mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); return -ENOMEM; } - sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT; + sk->sk_forward_alloc += pages << PAGE_SHIFT; - sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT; + sk->sk_reserved_mem += pages << PAGE_SHIFT; return 0; } +void sockopt_lock_sock(struct sock *sk) +{ + /* When current->bpf_ctx is set, the setsockopt is called from + * a bpf prog. bpf has ensured the sk lock has been + * acquired before calling setsockopt(). + */ + if (has_current_bpf_ctx()) + return; + + lock_sock(sk); +} +EXPORT_SYMBOL(sockopt_lock_sock); + +void sockopt_release_sock(struct sock *sk) +{ + if (has_current_bpf_ctx()) + return; + + release_sock(sk); +} +EXPORT_SYMBOL(sockopt_release_sock); + +bool sockopt_ns_capable(struct user_namespace *ns, int cap) +{ + return has_current_bpf_ctx() || ns_capable(ns, cap); +} +EXPORT_SYMBOL(sockopt_ns_capable); + +bool sockopt_capable(int cap) +{ + return has_current_bpf_ctx() || capable(cap); +} +EXPORT_SYMBOL(sockopt_capable); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */ -int sock_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int optlen) +int sk_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct so_timestamping timestamping; + struct socket *sock = sk->sk_socket; struct sock_txtime sk_txtime; - struct sock *sk = sock->sk; int val; int valbool; struct linger ling; @@ -1038,11 +1103,11 @@ int sock_setsockopt(struct socket *sock, int level, int optname, valbool = val ? 1 : 0; - lock_sock(sk); + sockopt_lock_sock(sk); switch (optname) { case SO_DEBUG: - if (val && !capable(CAP_NET_ADMIN)) + if (val && !sockopt_capable(CAP_NET_ADMIN)) ret = -EACCES; else sock_valbool_flag(sk, SOCK_DBG, valbool); @@ -1072,7 +1137,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_wmem_max); + val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); set_sndbuf: /* Ensure val * 2 fits into an int, to prevent max_t() * from treating it as a negative value. @@ -1086,7 +1151,7 @@ set_sndbuf: break; case SO_SNDBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { + if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1104,11 +1169,11 @@ set_sndbuf: * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); + __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); break; case SO_RCVBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { + if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1135,7 +1200,8 @@ set_sndbuf: case SO_PRIORITY: if ((val >= 0 && val <= 6) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) sk->sk_priority = val; else ret = -EPERM; @@ -1198,7 +1264,7 @@ set_sndbuf: case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - if (sock->ops->set_rcvlowat) + if (sock && sock->ops->set_rcvlowat) ret = sock->ops->set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); @@ -1280,13 +1346,23 @@ set_sndbuf: clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } __sock_set_mark(sk, val); break; + case SO_RCVMARK: + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + + sock_valbool_flag(sk, SOCK_RCVMARK, valbool); + break; case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); @@ -1314,7 +1390,7 @@ set_sndbuf: #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: /* allow unprivileged users to decrease the value */ - if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) + if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN)) ret = -EPERM; else { if (val < 0) @@ -1324,13 +1400,13 @@ set_sndbuf: } break; case SO_PREFER_BUSY_POLL: - if (valbool && !capable(CAP_NET_ADMIN)) + if (valbool && !sockopt_capable(CAP_NET_ADMIN)) ret = -EPERM; else WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); break; case SO_BUSY_POLL_BUDGET: - if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; } else { if (val < 0 || val > U16_MAX) @@ -1370,13 +1446,12 @@ set_sndbuf: case SO_ZEROCOPY: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { - if (!((sk->sk_type == SOCK_STREAM && - sk->sk_protocol == IPPROTO_TCP) || + if (!(sk_is_tcp(sk) || (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP))) - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; } else if (sk->sk_family != PF_RDS) { - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; } if (!ret) { if (val < 0 || val > 1) @@ -1402,7 +1477,7 @@ set_sndbuf: * scheduler has enough safe guards. */ if (sk_txtime.clockid != CLOCK_MONOTONIC && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1444,13 +1519,29 @@ set_sndbuf: break; } + case SO_TXREHASH: + if (val < -1 || val > 1) { + ret = -EINVAL; + break; + } + /* Paired with READ_ONCE() in tcp_rtx_synack() */ + WRITE_ONCE(sk->sk_txrehash, (u8)val); + break; + default: ret = -ENOPROTOOPT; break; } - release_sock(sk); + sockopt_release_sock(sk); return ret; } + +int sock_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + return sk_setsockopt(sock->sk, level, optname, + optval, optlen); +} EXPORT_SYMBOL(sock_setsockopt); static const struct cred *sk_get_peer_cred(struct sock *sk) @@ -1477,22 +1568,25 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred, } } -static int groups_to_user(gid_t __user *dst, const struct group_info *src) +static int groups_to_user(sockptr_t dst, const struct group_info *src) { struct user_namespace *user_ns = current_user_ns(); int i; - for (i = 0; i < src->ngroups; i++) - if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) + for (i = 0; i < src->ngroups; i++) { + gid_t gid = from_kgid_munged(user_ns, src->gid[i]); + + if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) return -EFAULT; + } return 0; } -int sock_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) +int sk_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen) { - struct sock *sk = sock->sk; + struct socket *sock = sk->sk_socket; union { int val; @@ -1509,7 +1603,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, int lv = sizeof(int); int len; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; @@ -1644,7 +1738,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); spin_unlock(&sk->sk_peer_lock); - if (copy_to_user(optval, &peercred, len)) + if (copy_to_sockptr(optval, &peercred, len)) return -EFAULT; goto lenout; } @@ -1662,11 +1756,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname, if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); put_cred(cred); - return put_user(len, optlen) ? -EFAULT : -ERANGE; + return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); - ret = groups_to_user((gid_t __user *)optval, cred->group_info); + ret = groups_to_user(optval, cred->group_info); put_cred(cred); if (ret) return ret; @@ -1682,7 +1776,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, return -ENOTCONN; if (lv < len) return -EINVAL; - if (copy_to_user(optval, address, len)) + if (copy_to_sockptr(optval, address, len)) return -EFAULT; goto lenout; } @@ -1699,12 +1793,16 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_PEERSEC: - return security_socket_getpeersec_stream(sock, optval, optlen, len); + return security_socket_getpeersec_stream(sock, optval.user, optlen.user, len); case SO_MARK: v.val = sk->sk_mark; break; + case SO_RCVMARK: + v.val = sock_flag(sk, SOCK_RCVMARK); + break; + case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; @@ -1727,7 +1825,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, return sock_getbindtodevice(sk, optval, optlen, len); case SO_GET_FILTER: - len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); + len = sk_get_filter(sk, optval, len); if (len < 0) return len; @@ -1775,7 +1873,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, sk_get_meminfo(sk, meminfo); len = min_t(unsigned int, len, sizeof(meminfo)); - if (copy_to_user(optval, &meminfo, len)) + if (copy_to_sockptr(optval, &meminfo, len)) return -EFAULT; goto lenout; @@ -1813,7 +1911,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_BINDTOIFINDEX: - v.val = sk->sk_bound_dev_if; + v.val = READ_ONCE(sk->sk_bound_dev_if); break; case SO_NETNS_COOKIE: @@ -1831,6 +1929,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_reserved_mem; break; + case SO_TXREHASH: + v.val = sk->sk_txrehash; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -1840,14 +1942,22 @@ int sock_getsockopt(struct socket *sock, int level, int optname, if (len > lv) len = lv; - if (copy_to_user(optval, &v, len)) + if (copy_to_sockptr(optval, &v, len)) return -EFAULT; lenout: - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } +int sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + return sk_getsockopt(sock->sk, level, optname, + USER_SOCKPTR(optval), + USER_SOCKPTR(optlen)); +} + /* * Initialize an sk_lock. * @@ -1982,7 +2092,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_lock_init(sk); sk->sk_net_refcnt = kern ? 0 : 1; if (likely(sk->sk_net_refcnt)) { - get_net(net); + get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); } @@ -2038,7 +2148,7 @@ static void __sk_destruct(struct rcu_head *head) put_pid(sk->sk_peer_pid); if (likely(sk->sk_net_refcnt)) - put_net(sock_net(sk)); + put_net_track(sock_net(sk), &sk->ns_tracker); sk_prot_free(sk->sk_prot_creator, sk); } @@ -2125,7 +2235,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) /* SANITY */ if (likely(newsk->sk_net_refcnt)) { - get_net(sock_net(newsk)); + get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); sock_inuse_add(sock_net(newsk), 1); } sk_node_init(&newsk->sk_node); @@ -2241,22 +2351,42 @@ void sk_free_unlock_clone(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +static void sk_trim_gso_size(struct sock *sk) +{ + if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) + return; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6 && + sk_is_tcp(sk) && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return; +#endif + sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; +} + void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; sk_dst_set(sk, dst); - sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; + sk->sk_route_caps = dst->dev->features; + if (sk_is_tcp(sk)) + sk->sk_route_caps |= NETIF_F_GSO; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; - sk->sk_route_caps &= ~sk->sk_route_nocaps; + if (unlikely(sk->sk_gso_disabled)) + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; if (sk_can_gso(sk)) { if (dst->header_len && !xfrm_dst_offload_ok(dst)) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - sk->sk_gso_max_size = dst->dev->gso_max_size; - max_segs = max_t(u32, dst->dev->gso_max_segs, 1); + /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ + sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); + sk_trim_gso_size(sk); + sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); + /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ + max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; @@ -2275,8 +2405,20 @@ void sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; unsigned int len = skb->truesize; + bool free; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { + if (sock_flag(sk, SOCK_RCU_FREE) && + sk->sk_write_space == sock_def_write_space) { + rcu_read_lock(); + free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); + sock_def_write_space_wfree(sk); + rcu_read_unlock(); + if (unlikely(free)) + __sk_free(sk); + return; + } + /* * Keep a reference on sk_wmem_alloc, this will be released * after sk_write_space() call @@ -2448,7 +2590,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > - sysctl_optmem_max) + READ_ONCE(sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); @@ -2466,8 +2608,10 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { - if ((unsigned int)size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + int optmem_max = READ_ONCE(sysctl_optmem_max); + + if ((unsigned int)size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. @@ -2586,13 +2730,6 @@ failure: } EXPORT_SYMBOL(sock_alloc_send_pskb); -struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, - int noblock, int *errcode) -{ - return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); -} -EXPORT_SYMBOL(sock_alloc_send_skb); - int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { @@ -2600,7 +2737,8 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, switch (cmsg->cmsg_type) { case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; @@ -2763,7 +2901,7 @@ void __release_sock(struct sock *sk) do { next = skb->next; prefetch(next); - WARN_ON_ONCE(skb_dst_is_noref(skb)); + DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); @@ -2788,6 +2926,7 @@ void __sk_flush_backlog(struct sock *sk) __release_sock(sk); spin_unlock_bh(&sk->sk_lock.slock); } +EXPORT_SYMBOL_GPL(__sk_flush_backlog); /** * sk_wait_data - wait for data to arrive at sk_receive_queue @@ -2825,11 +2964,13 @@ EXPORT_SYMBOL(sk_wait_data); */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { - struct proto *prot = sk->sk_prot; - long allocated = sk_memory_allocated_add(sk, amt); bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg; + struct proto *prot = sk->sk_prot; bool charged = true; + long allocated; + sk_memory_allocated_add(sk, amt); + allocated = sk_memory_allocated(sk); if (memcg_charge && !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt, gfp_memcg_charge()))) @@ -2906,7 +3047,6 @@ suppress_allocation: return 0; } -EXPORT_SYMBOL(__sk_mem_raise_allocated); /** * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated @@ -2922,10 +3062,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) { int ret, amt = sk_mem_pages(size); - sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; + sk->sk_forward_alloc += amt << PAGE_SHIFT; ret = __sk_mem_raise_allocated(sk, size, amt, kind); if (!ret) - sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; + sk->sk_forward_alloc -= amt << PAGE_SHIFT; return ret; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -2948,17 +3088,16 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } -EXPORT_SYMBOL(__sk_mem_reduce_allocated); /** * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated * @sk: socket - * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) + * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) */ void __sk_mem_reclaim(struct sock *sk, int amount) { - amount >>= SK_MEM_QUANTUM_SHIFT; - sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + amount >>= PAGE_SHIFT; + sk->sk_forward_alloc -= amount << PAGE_SHIFT; __sk_mem_reduce_allocated(sk, amount); } EXPORT_SYMBOL(__sk_mem_reclaim); @@ -3148,20 +3287,42 @@ static void sock_def_write_space(struct sock *sk) /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ - if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) { + if (sock_writeable(sk)) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); /* Should agree with poll, otherwise some programs break */ - if (sock_writeable(sk)) - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } +/* An optimised version of sock_def_write_space(), should only be called + * for SOCK_RCU_FREE sockets under RCU read section and after putting + * ->sk_wmem_alloc. + */ +static void sock_def_write_space_wfree(struct sock *sk) +{ + /* Do not wake up a writer until he can make "significant" + * progress. --DaveM + */ + if (sock_writeable(sk)) { + struct socket_wq *wq = rcu_dereference(sk->sk_wq); + + /* rely on refcount_sub from sock_wfree() */ + smp_mb__after_atomic(); + if (wq && waitqueue_active(&wq->wait)) + wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | + EPOLLWRNORM | EPOLLWRBAND); + + /* Should agree with poll, otherwise some programs break */ + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + } +} + static void sock_def_destruct(struct sock *sk) { } @@ -3204,8 +3365,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; - sk->sk_rcvbuf = sysctl_rmem_default; - sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); + sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; sk_set_socket(sk, sock); @@ -3260,13 +3421,14 @@ void sock_init_data(struct socket *sock, struct sock *sk) #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; - sk->sk_ll_usec = sysctl_net_busy_read; + sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); #endif sk->sk_max_pacing_rate = ~0UL; sk->sk_pacing_rate = ~0UL; WRITE_ONCE(sk->sk_pacing_shift, 10); sk->sk_incoming_cpu = -1; + sk->sk_txrehash = SOCK_TXREHASH_DEFAULT; sk_rx_queue_clear(sk); /* @@ -3286,7 +3448,7 @@ void lock_sock_nested(struct sock *sk, int subclass) might_sleep(); spin_lock_bh(&sk->sk_lock.slock); - if (sk->sk_lock.owned) + if (sock_owned_by_user_nocheck(sk)) __lock_sock(sk); sk->sk_lock.owned = 1; spin_unlock_bh(&sk->sk_lock.slock); @@ -3317,7 +3479,7 @@ bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) might_sleep(); spin_lock_bh(&sk->sk_lock.slock); - if (!sk->sk_lock.owned) { + if (!sock_owned_by_user_nocheck(sk)) { /* * Fast path return with bottom halves disabled and * sock::sk_lock.slock held. @@ -3448,7 +3610,8 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; - return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_getsockopt); @@ -3459,8 +3622,7 @@ int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int addr_len = 0; int err; - err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, - flags & ~MSG_DONTWAIT, &addr_len); + err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; @@ -3475,7 +3637,8 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; - return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_setsockopt); @@ -3532,19 +3695,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) } #ifdef CONFIG_PROC_FS -#define PROTO_INUSE_NR 64 /* should be enough for the first time */ -struct prot_inuse { - int val[PROTO_INUSE_NR]; -}; - static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); -void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) -{ - __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); -} -EXPORT_SYMBOL_GPL(sock_prot_inuse_add); - int sock_prot_inuse_get(struct net *net, struct proto *prot) { int cpu, idx = prot->inuse_idx; @@ -3557,17 +3709,12 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot) } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); -static void sock_inuse_add(struct net *net, int val) -{ - this_cpu_add(*net->core.sock_inuse, val); -} - int sock_inuse_get(struct net *net) { int cpu, res = 0; for_each_possible_cpu(cpu) - res += *per_cpu_ptr(net->core.sock_inuse, cpu); + res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; return res; } @@ -3579,22 +3726,12 @@ static int __net_init sock_inuse_init_net(struct net *net) net->core.prot_inuse = alloc_percpu(struct prot_inuse); if (net->core.prot_inuse == NULL) return -ENOMEM; - - net->core.sock_inuse = alloc_percpu(int); - if (net->core.sock_inuse == NULL) - goto out; - return 0; - -out: - free_percpu(net->core.prot_inuse); - return -ENOMEM; } static void __net_exit sock_inuse_exit_net(struct net *net) { free_percpu(net->core.prot_inuse); - free_percpu(net->core.sock_inuse); } static struct pernet_operations net_inuse_ops = { @@ -3640,9 +3777,6 @@ static inline void release_proto_idx(struct proto *prot) { } -static void sock_inuse_add(struct net *net, int val) -{ -} #endif static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) @@ -3720,6 +3854,14 @@ int proto_register(struct proto *prot, int alloc_slab) { int ret = -ENOBUFS; + if (prot->memory_allocated && !prot->sysctl_mem) { + pr_err("%s: missing sysctl_mem\n", prot->name); + return -EINVAL; + } + if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { + pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); + return -EINVAL; + } if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index c9c45b935f99..f7cf74cdd3db 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -1,5 +1,6 @@ /* License: GPL */ +#include <linux/filter.h> #include <linux/mutex.h> #include <linux/socket.h> #include <linux/skbuff.h> diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 4ca4b11f4e5f..81beb16ab1eb 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -41,7 +41,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT); + stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE); if (!stab) return ERR_PTR(-ENOMEM); @@ -52,7 +52,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { - kfree(stab); + bpf_map_area_free(stab); return ERR_PTR(-ENOMEM); } @@ -292,15 +292,23 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) if (skb_verdict) psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + /* msg_* and stream_* programs references tracked in psock after this + * point. Reference dec and cleanup will occur through psock destructor + */ ret = sock_map_init_proto(sk, psock); - if (ret < 0) - goto out_drop; + if (ret < 0) { + sk_psock_put(sk, psock); + goto out; + } write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); - if (ret) - goto out_unlock_drop; + if (ret) { + write_unlock_bh(&sk->sk_callback_lock); + sk_psock_put(sk, psock); + goto out; + } sk_psock_start_strp(sk, psock); } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { sk_psock_start_verdict(sk,psock); @@ -309,10 +317,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } write_unlock_bh(&sk->sk_callback_lock); return 0; -out_unlock_drop: - write_unlock_bh(&sk->sk_callback_lock); -out_drop: - sk_psock_put(sk, psock); out_progs: if (skb_verdict) bpf_prog_put(skb_verdict); @@ -325,6 +329,7 @@ out_put_stream_parser: out_put_stream_verdict: if (stream_verdict) bpf_prog_put(stream_verdict); +out: return ret; } @@ -356,7 +361,7 @@ static void sock_map_free(struct bpf_map *map) synchronize_rcu(); bpf_map_area_free(stab->sks); - kfree(stab); + bpf_map_area_free(stab); } static void sock_map_release_progs(struct bpf_map *map) @@ -778,17 +783,26 @@ static int sock_map_init_seq_private(void *priv_data, { struct sock_map_seq_info *info = priv_data; + bpf_map_inc_with_uref(aux->map); info->map = aux->map; return 0; } +static void sock_map_fini_seq_private(void *priv_data) +{ + struct sock_map_seq_info *info = priv_data; + + bpf_map_put_with_uref(info->map); +} + static const struct bpf_iter_seq_info sock_map_iter_seq_info = { .seq_ops = &sock_map_seq_ops, .init_seq_private = sock_map_init_seq_private, + .fini_seq_private = sock_map_fini_seq_private, .seq_priv_size = sizeof(struct sock_map_seq_info), }; -static int sock_map_btf_id; +BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab) const struct bpf_map_ops sock_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_map_alloc, @@ -800,8 +814,7 @@ const struct bpf_map_ops sock_map_ops = { .map_lookup_elem = sock_map_lookup, .map_release_uref = sock_map_release_progs, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_stab", - .map_btf_id = &sock_map_btf_id, + .map_btf_id = &sock_map_btf_ids[0], .iter_seq_info = &sock_map_iter_seq_info, }; @@ -1072,7 +1085,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); - htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); + htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); @@ -1102,7 +1115,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) return &htab->map; free_htab: - kfree(htab); + bpf_map_area_free(htab); return ERR_PTR(err); } @@ -1155,7 +1168,7 @@ static void sock_hash_free(struct bpf_map *map) synchronize_rcu(); bpf_map_area_free(htab->buckets); - kfree(htab); + bpf_map_area_free(htab); } static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) @@ -1365,22 +1378,31 @@ static const struct seq_operations sock_hash_seq_ops = { }; static int sock_hash_init_seq_private(void *priv_data, - struct bpf_iter_aux_info *aux) + struct bpf_iter_aux_info *aux) { struct sock_hash_seq_info *info = priv_data; + bpf_map_inc_with_uref(aux->map); info->map = aux->map; info->htab = container_of(aux->map, struct bpf_shtab, map); return 0; } +static void sock_hash_fini_seq_private(void *priv_data) +{ + struct sock_hash_seq_info *info = priv_data; + + bpf_map_put_with_uref(info->map); +} + static const struct bpf_iter_seq_info sock_hash_iter_seq_info = { .seq_ops = &sock_hash_seq_ops, .init_seq_private = sock_hash_init_seq_private, + .fini_seq_private = sock_hash_fini_seq_private, .seq_priv_size = sizeof(struct sock_hash_seq_info), }; -static int sock_hash_map_btf_id; +BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab) const struct bpf_map_ops sock_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = sock_hash_alloc, @@ -1392,8 +1414,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_lookup_elem_sys_only = sock_hash_lookup_sys, .map_release_uref = sock_hash_release_progs, .map_check_btf = map_check_no_btf, - .map_btf_name = "bpf_shtab", - .map_btf_id = &sock_hash_map_btf_id, + .map_btf_id = &sock_hash_map_btf_ids[0], .iter_seq_info = &sock_hash_iter_seq_info, }; @@ -1411,38 +1432,50 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) +static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); - struct bpf_prog **pprog; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - pprog = &progs->msg_parser; + *pprog = &progs->msg_parser; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - pprog = &progs->stream_parser; + *pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; - pprog = &progs->stream_verdict; + *pprog = &progs->stream_verdict; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; - pprog = &progs->skb_verdict; + *pprog = &progs->skb_verdict; break; default: return -EOPNOTSUPP; } + return 0; +} + +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which) +{ + struct bpf_prog **pprog; + int ret; + + ret = sock_map_prog_lookup(map, &pprog, which); + if (ret) + return ret; + if (old) return psock_replace_prog(pprog, prog, old); @@ -1450,6 +1483,57 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, return 0; } +int sock_map_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd; + struct bpf_prog **pprog; + struct bpf_prog *prog; + struct bpf_map *map; + struct fd f; + u32 id = 0; + int ret; + + if (attr->query.query_flags) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + rcu_read_lock(); + + ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); + if (ret) + goto end; + + prog = *pprog; + prog_cnt = !prog ? 0 : 1; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + goto end; + + /* we do not hold the refcnt, the bpf prog may be released + * asynchronously and the id would be set to 0. + */ + id = data_race(prog->aux->id); + if (id == 0) + prog_cnt = 0; + +end: + rcu_read_unlock(); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || + (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || + copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + ret = -EFAULT; + + fdput(f); + return ret; +} + static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { @@ -1495,6 +1579,29 @@ void sock_map_unhash(struct sock *sk) } EXPORT_SYMBOL_GPL(sock_map_unhash); +void sock_map_destroy(struct sock *sk) +{ + void (*saved_destroy)(struct sock *sk); + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock_get(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->destroy) + sk->sk_prot->destroy(sk); + return; + } + + saved_destroy = psock->saved_destroy; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + sk_psock_stop(psock); + sk_psock_put(sk, psock); + saved_destroy(sk); +} +EXPORT_SYMBOL_GPL(sock_map_destroy); + void sock_map_close(struct sock *sk, long timeout) { void (*saved_close)(struct sock *sk, long timeout); @@ -1512,9 +1619,10 @@ void sock_map_close(struct sock *sk, long timeout) saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); - sk_psock_stop(psock, true); - sk_psock_put(sk, psock); + sk_psock_stop(psock); release_sock(sk); + cancel_work_sync(&psock->work); + sk_psock_put(sk, psock); saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); @@ -1564,7 +1672,7 @@ static struct bpf_iter_reg sock_map_iter_reg = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), - PTR_TO_RDONLY_BUF_OR_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 3f00a28fe762..fb90e1e00773 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -21,6 +21,22 @@ static DEFINE_IDA(reuseport_ida); static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, struct sock_reuseport *reuse, bool bind_inany); +void reuseport_has_conns_set(struct sock *sk) +{ + struct sock_reuseport *reuse; + + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (likely(reuse)) + reuse->has_conns = 1; + spin_unlock_bh(&reuseport_lock); +} +EXPORT_SYMBOL(reuseport_has_conns_set); + static int reuseport_sock_index(struct sock *sk, const struct sock_reuseport *reuse, bool closed) @@ -387,7 +403,7 @@ void reuseport_stop_listen_sock(struct sock *sk) prog = rcu_dereference_protected(reuse->prog, lockdep_is_held(&reuseport_lock)); - if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req || + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) || (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) { /* Migration capable, move sk from the listening section * to the closed section. @@ -545,7 +561,7 @@ struct sock *reuseport_migrate_sock(struct sock *sk, hash = migrating_sk->sk_hash; prog = rcu_dereference(reuse->prog); if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) { - if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req)) goto select_by_hash; goto failure; } diff --git a/net/core/stream.c b/net/core/stream.c index 06b36c730ce8..75fded8495f5 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -123,7 +123,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk_stream_memory_free(sk)) - current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2; + current_timeo = vm_wait = prandom_u32_max(HZ / 5) + 2; add_wait_queue(sk_sleep(sk), &wait); @@ -159,7 +159,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) *timeo_p = current_timeo; } out: - remove_wait_queue(sk_sleep(sk), &wait); + if (!sock_flag(sk, SOCK_DEAD)) + remove_wait_queue(sk_sleep(sk), &wait); return err; do_error: @@ -196,13 +197,13 @@ void sk_stream_kill_queues(struct sock *sk) __skb_queue_purge(&sk->sk_receive_queue); /* Next, the write queue. */ - WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); + WARN_ON_ONCE(!skb_queue_empty(&sk->sk_write_queue)); /* Account for returned memory. */ sk_mem_reclaim_final(sk); - WARN_ON(sk->sk_wmem_queued); - WARN_ON(sk->sk_forward_alloc); + WARN_ON_ONCE(sk->sk_wmem_queued); + WARN_ON_ONCE(sk->sk_forward_alloc); /* It is _impossible_ for the backlog to contain anything * when we get here. All user references to this socket diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 5f88526ad61c..5b1ce656baa1 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -6,6 +6,7 @@ * Added /proc/sys/net/core directory entry (empty =) ). [MS] */ +#include <linux/filter.h> #include <linux/mm.h> #include <linux/sysctl.h> #include <linux/module.h> @@ -22,14 +23,12 @@ #include <net/busy_poll.h> #include <net/pkt_sched.h> -static int two = 2; -static int three = 3; +#include "dev.h" + static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; -static long long_one __maybe_unused = 1; -static long long_max __maybe_unused = LONG_MAX; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -102,8 +101,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - synchronize_rcu(); - vfree(orig_sock_table); + kvfree_rcu(orig_sock_table); } } } @@ -141,8 +139,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); - synchronize_rcu(); - kfree(cur); + kfree_rcu(cur); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); @@ -236,14 +233,17 @@ static int set_default_qdisc(struct ctl_table *table, int write, static int proc_do_dev_weight(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int ret; + static DEFINE_MUTEX(dev_weight_mutex); + int ret, weight; + mutex_lock(&dev_weight_mutex); ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret != 0) - return ret; - - dev_rx_weight = weight_p * dev_weight_rx_bias; - dev_tx_weight = weight_p * dev_weight_tx_bias; + if (!ret && write) { + weight = READ_ONCE(weight_p); + WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); + } + mutex_unlock(&dev_weight_mutex); return ret; } @@ -266,6 +266,8 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; + int min = *(int *)table->extra1; + int max = *(int *)table->extra2; struct ctl_table tmp = *table; if (write && !capable(CAP_SYS_ADMIN)) @@ -283,6 +285,10 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, ret = -EPERM; } } + + if (write && ret && min == max) + pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n"); + return ret; } @@ -389,7 +395,7 @@ static struct ctl_table net_core_table[] = { .extra2 = SYSCTL_ONE, # else .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, # endif }, # ifdef CONFIG_HAVE_EBPF_JIT @@ -400,7 +406,7 @@ static struct ctl_table net_core_table[] = { .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "bpf_jit_kallsyms", @@ -418,7 +424,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(long), .mode = 0600, .proc_handler = proc_dolongvec_minmax_bpf_restricted, - .extra1 = &long_one, + .extra1 = SYSCTL_LONG_ONE, .extra2 = &bpf_jit_limit_max, }, #endif @@ -545,7 +551,7 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "devconf_inherit_init_net", @@ -554,7 +560,7 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &three, + .extra2 = SYSCTL_THREE, }, { .procname = "high_order_alloc_disable", @@ -580,6 +586,14 @@ static struct ctl_table net_core_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &int_3600, }, + { + .procname = "skb_defer_max", + .data = &sysctl_skb_defer_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, { } }; @@ -592,6 +606,15 @@ static struct ctl_table netns_core_table[] = { .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, + { + .procname = "txrehash", + .data = &init_net.core.sysctl_txrehash, + .maxlen = sizeof(u8), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + .proc_handler = proc_dou8vec_minmax, + }, { } }; @@ -610,7 +633,7 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl; + struct ctl_table *tbl, *tmp; tbl = netns_core_table; if (!net_eq(net, &init_net)) { @@ -618,7 +641,8 @@ static __net_init int sysctl_core_net_init(struct net *net) if (tbl == NULL) goto err_dup; - tbl[0].data = &net->core.sysctl_somaxconn; + for (tmp = tbl; tmp->procname; tmp++) + tmp->data += (char *)net - (char *)&init_net; /* Don't export any sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { diff --git a/net/core/utils.c b/net/core/utils.c index 1f31a39236d5..938495bc1d34 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -476,9 +476,9 @@ void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, __wsum diff, bool pseudohdr) { if (skb->ip_summed != CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); + csum_replace_by_diff(sum, diff); if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_add(diff, ~skb->csum); + skb->csum = ~csum_sub(diff, skb->csum); } else if (pseudohdr) { *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); } diff --git a/net/core/xdp.c b/net/core/xdp.c index 5ddc29f29bad..844c9d99dc0e 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -110,20 +110,15 @@ static void mem_allocator_disconnect(void *allocator) mutex_unlock(&mem_id_lock); } -void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +void xdp_unreg_mem_model(struct xdp_mem_info *mem) { struct xdp_mem_allocator *xa; - int type = xdp_rxq->mem.type; - int id = xdp_rxq->mem.id; + int type = mem->type; + int id = mem->id; /* Reset mem info to defaults */ - xdp_rxq->mem.id = 0; - xdp_rxq->mem.type = 0; - - if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { - WARN(1, "Missing register, driver bug"); - return; - } + mem->id = 0; + mem->type = 0; if (id == 0) return; @@ -135,6 +130,17 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) rcu_read_unlock(); } } +EXPORT_SYMBOL_GPL(xdp_unreg_mem_model); + +void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return; + } + + xdp_unreg_mem_model(&xdp_rxq->mem); +} EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) @@ -156,9 +162,15 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) } /* Returns 0 on success, negative on failure */ -int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index, unsigned int napi_id) +int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index, + unsigned int napi_id, u32 frag_size) { + if (!dev) { + WARN(1, "Missing net_device from driver"); + return -ENODEV; + } + if (xdp_rxq->reg_state == REG_STATE_UNUSED) { WARN(1, "Driver promised not to register this"); return -EINVAL; @@ -169,21 +181,17 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, xdp_rxq_info_unreg(xdp_rxq); } - if (!dev) { - WARN(1, "Missing net_device from driver"); - return -ENODEV; - } - /* State either UNREGISTERED or NEW */ xdp_rxq_info_init(xdp_rxq); xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; xdp_rxq->napi_id = napi_id; + xdp_rxq->frag_size = frag_size; xdp_rxq->reg_state = REG_STATE_REGISTERED; return 0; } -EXPORT_SYMBOL_GPL(xdp_rxq_info_reg); +EXPORT_SYMBOL_GPL(__xdp_rxq_info_reg); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) { @@ -259,28 +267,24 @@ static bool __is_supported_mem_type(enum xdp_mem_type type) return true; } -int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, - enum xdp_mem_type type, void *allocator) +static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, + enum xdp_mem_type type, + void *allocator) { struct xdp_mem_allocator *xdp_alloc; gfp_t gfp = GFP_KERNEL; int id, errno, ret; void *ptr; - if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { - WARN(1, "Missing register, driver bug"); - return -EFAULT; - } - if (!__is_supported_mem_type(type)) - return -EOPNOTSUPP; + return ERR_PTR(-EOPNOTSUPP); - xdp_rxq->mem.type = type; + mem->type = type; if (!allocator) { if (type == MEM_TYPE_PAGE_POOL) - return -EINVAL; /* Setup time check page_pool req */ - return 0; + return ERR_PTR(-EINVAL); /* Setup time check page_pool req */ + return NULL; } /* Delay init of rhashtable to save memory if feature isn't used */ @@ -290,13 +294,13 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, mutex_unlock(&mem_id_lock); if (ret < 0) { WARN_ON(1); - return ret; + return ERR_PTR(ret); } } xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); if (!xdp_alloc) - return -ENOMEM; + return ERR_PTR(-ENOMEM); mutex_lock(&mem_id_lock); id = __mem_id_cyclic_get(gfp); @@ -304,31 +308,62 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, errno = id; goto err; } - xdp_rxq->mem.id = id; - xdp_alloc->mem = xdp_rxq->mem; + mem->id = id; + xdp_alloc->mem = *mem; xdp_alloc->allocator = allocator; /* Insert allocator into ID lookup table */ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); if (IS_ERR(ptr)) { - ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id); - xdp_rxq->mem.id = 0; + ida_simple_remove(&mem_id_pool, mem->id); + mem->id = 0; errno = PTR_ERR(ptr); goto err; } if (type == MEM_TYPE_PAGE_POOL) - page_pool_use_xdp_mem(allocator, mem_allocator_disconnect); + page_pool_use_xdp_mem(allocator, mem_allocator_disconnect, mem); mutex_unlock(&mem_id_lock); - trace_mem_connect(xdp_alloc, xdp_rxq); - return 0; + return xdp_alloc; err: mutex_unlock(&mem_id_lock); kfree(xdp_alloc); - return errno; + return ERR_PTR(errno); } + +int xdp_reg_mem_model(struct xdp_mem_info *mem, + enum xdp_mem_type type, void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + + xdp_alloc = __xdp_reg_mem_model(mem, type, allocator); + if (IS_ERR(xdp_alloc)) + return PTR_ERR(xdp_alloc); + return 0; +} +EXPORT_SYMBOL_GPL(xdp_reg_mem_model); + +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return -EFAULT; + } + + xdp_alloc = __xdp_reg_mem_model(&xdp_rxq->mem, type, allocator); + if (IS_ERR(xdp_alloc)) + return PTR_ERR(xdp_alloc); + + if (trace_mem_connect_enabled() && xdp_alloc) + trace_mem_connect(xdp_alloc, xdp_rxq); + return 0; +} + EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); /* XDP RX runs under NAPI protection, and in different delivery error @@ -337,22 +372,20 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - struct xdp_buff *xdp) +void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, + struct xdp_buff *xdp) { - struct xdp_mem_allocator *xa; struct page *page; switch (mem->type) { case MEM_TYPE_PAGE_POOL: - rcu_read_lock(); - /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); if (napi_direct && xdp_return_frame_no_direct()) napi_direct = false; - page_pool_put_full_page(xa->page_pool, page, napi_direct); - rcu_read_unlock(); + /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) + * as mem->type knows this a page_pool page + */ + page_pool_put_full_page(page->pp, page, napi_direct); break; case MEM_TYPE_PAGE_SHARED: page_frag_free(data); @@ -374,12 +407,38 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { + struct skb_shared_info *sinfo; + int i; + + if (likely(!xdp_frame_has_frags(xdpf))) + goto out; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (i = 0; i < sinfo->nr_frags; i++) { + struct page *page = skb_frag_page(&sinfo->frags[i]); + + __xdp_return(page_address(page), &xdpf->mem, false, NULL); + } +out: __xdp_return(xdpf->data, &xdpf->mem, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { + struct skb_shared_info *sinfo; + int i; + + if (likely(!xdp_frame_has_frags(xdpf))) + goto out; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (i = 0; i < sinfo->nr_frags; i++) { + struct page *page = skb_frag_page(&sinfo->frags[i]); + + __xdp_return(page_address(page), &xdpf->mem, true, NULL); + } +out: __xdp_return(xdpf->data, &xdpf->mem, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); @@ -415,7 +474,7 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_mem_allocator *xa; if (mem->type != MEM_TYPE_PAGE_POOL) { - __xdp_return(xdpf->data, &xdpf->mem, false, NULL); + xdp_return_frame(xdpf); return; } @@ -434,14 +493,41 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf, bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); } + if (unlikely(xdp_frame_has_frags(xdpf))) { + struct skb_shared_info *sinfo; + int i; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (i = 0; i < sinfo->nr_frags; i++) { + skb_frag_t *frag = &sinfo->frags[i]; + + bq->q[bq->count++] = skb_frag_address(frag); + if (bq->count == XDP_BULK_QUEUE_SIZE) + xdp_flush_frame_bulk(bq); + } + } bq->q[bq->count++] = xdpf->data; } EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); void xdp_return_buff(struct xdp_buff *xdp) { + struct skb_shared_info *sinfo; + int i; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + sinfo = xdp_get_shared_info_from_buff(xdp); + for (i = 0; i < sinfo->nr_frags; i++) { + struct page *page = skb_frag_page(&sinfo->frags[i]); + + __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp); + } +out: __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); } +EXPORT_SYMBOL_GPL(xdp_return_buff); /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) @@ -529,8 +615,14 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev) { + struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf); unsigned int headroom, frame_size; void *hard_start; + u8 nr_frags; + + /* xdp frags frame */ + if (unlikely(xdp_frame_has_frags(xdpf))) + nr_frags = sinfo->nr_frags; /* Part of headroom was reserved to xdpf */ headroom = sizeof(*xdpf) + xdpf->headroom; @@ -550,6 +642,12 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, if (xdpf->metasize) skb_metadata_set(skb, xdpf->metasize); + if (unlikely(xdp_frame_has_frags(xdpf))) + xdp_update_skb_shared_info(skb, nr_frags, + sinfo->xdp_frags_size, + nr_frags * xdpf->frame_sz, + xdp_frame_is_frag_pfmemalloc(xdpf)); + /* Essential SKB info: protocol and skb->dev */ skb->protocol = eth_type_trans(skb, dev); |