aboutsummaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile13
-rw-r--r--net/core/bpf_sk_storage.c1350
-rw-r--r--net/core/datagram.c116
-rw-r--r--net/core/datagram.h15
-rw-r--r--net/core/dev.c4435
-rw-r--r--net/core/dev.h112
-rw-r--r--net/core/dev_addr_lists.c203
-rw-r--r--net/core/dev_addr_lists_test.c236
-rw-r--r--net/core/dev_ioctl.c320
-rw-r--r--net/core/devlink.c7198
-rw-r--r--net/core/drop_monitor.c333
-rw-r--r--net/core/dst.c79
-rw-r--r--net/core/dst_cache.c19
-rw-r--r--net/core/failover.c4
-rw-r--r--net/core/fib_rules.c66
-rw-r--r--net/core/filter.c4073
-rw-r--r--net/core/flow_dissector.c467
-rw-r--r--net/core/flow_offload.c448
-rw-r--r--net/core/gen_estimator.c59
-rw-r--r--net/core/gen_stats.c188
-rw-r--r--net/core/gro.c805
-rw-r--r--net/core/gro_cells.c42
-rw-r--r--net/core/link_watch.c37
-rw-r--r--net/core/lwt_bpf.c22
-rw-r--r--net/core/lwtunnel.c16
-rw-r--r--net/core/neighbour.c433
-rw-r--r--net/core/net-procfs.c58
-rw-r--r--net/core/net-sysfs.c571
-rw-r--r--net/core/net-sysfs.h2
-rw-r--r--net/core/net-traces.c1
-rw-r--r--net/core/net_namespace.c145
-rw-r--r--net/core/netclassid_cgroup.c16
-rw-r--r--net/core/netevent.c2
-rw-r--r--net/core/netpoll.c58
-rw-r--r--net/core/netprio_cgroup.c11
-rw-r--r--net/core/of_net.c170
-rw-r--r--net/core/page_pool.c571
-rw-r--r--net/core/pktgen.c289
-rw-r--r--net/core/ptp_classifier.c44
-rw-r--r--net/core/rtnetlink.c1393
-rw-r--r--net/core/scm.c128
-rw-r--r--net/core/secure_seq.c25
-rw-r--r--net/core/selftests.c412
-rw-r--r--net/core/skbuff.c1470
-rw-r--r--net/core/skmsg.c731
-rw-r--r--net/core/sock.c1488
-rw-r--r--net/core/sock_destructor.h12
-rw-r--r--net/core/sock_diag.c10
-rw-r--r--net/core/sock_map.c1056
-rw-r--r--net/core/sock_reuseport.c419
-rw-r--r--net/core/stream.c16
-rw-r--r--net/core/sysctl_net_core.c124
-rw-r--r--net/core/tso.c44
-rw-r--r--net/core/utils.c4
-rw-r--r--net/core/xdp.c421
55 files changed, 22664 insertions, 8116 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 3e2c378e5f31..5857cec87b83 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -4,34 +4,39 @@
#
obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
- gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o
+ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \
+ flow_dissector.o
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
- fib_notifier.o xdp.o flow_offload.o
+ fib_notifier.o xdp.o flow_offload.o gro.o
+
+obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
-obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
+obj-$(CONFIG_NET_SELFTESTS) += selftests.o
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
-obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o
obj-$(CONFIG_FAILOVER) += failover.o
+obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
+obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
+obj-$(CONFIG_OF) += of_net.o
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 3ab23f698221..94374d529ea4 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -6,507 +6,41 @@
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_local_storage.h>
#include <net/bpf_sk_storage.h>
#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
-static atomic_t cache_idx;
+DEFINE_BPF_STORAGE_CACHE(sk_cache);
-#define SK_STORAGE_CREATE_FLAG_MASK \
- (BPF_F_NO_PREALLOC | BPF_F_CLONE)
-
-struct bucket {
- struct hlist_head list;
- raw_spinlock_t lock;
-};
-
-/* Thp map is not the primary owner of a bpf_sk_storage_elem.
- * Instead, the sk->sk_bpf_storage is.
- *
- * The map (bpf_sk_storage_map) is for two purposes
- * 1. Define the size of the "sk local storage". It is
- * the map's value_size.
- *
- * 2. Maintain a list to keep track of all elems such
- * that they can be cleaned up during the map destruction.
- *
- * When a bpf local storage is being looked up for a
- * particular sk, the "bpf_map" pointer is actually used
- * as the "key" to search in the list of elem in
- * sk->sk_bpf_storage.
- *
- * Hence, consider sk->sk_bpf_storage is the mini-map
- * with the "bpf_map" pointer as the searching key.
- */
-struct bpf_sk_storage_map {
- struct bpf_map map;
- /* Lookup elem does not require accessing the map.
- *
- * Updating/Deleting requires a bucket lock to
- * link/unlink the elem from the map. Having
- * multiple buckets to improve contention.
- */
- struct bucket *buckets;
- u32 bucket_log;
- u16 elem_size;
- u16 cache_idx;
-};
-
-struct bpf_sk_storage_data {
- /* smap is used as the searching key when looking up
- * from sk->sk_bpf_storage.
- *
- * Put it in the same cacheline as the data to minimize
- * the number of cachelines access during the cache hit case.
- */
- struct bpf_sk_storage_map __rcu *smap;
- u8 data[0] __aligned(8);
-};
-
-/* Linked to bpf_sk_storage and bpf_sk_storage_map */
-struct bpf_sk_storage_elem {
- struct hlist_node map_node; /* Linked to bpf_sk_storage_map */
- struct hlist_node snode; /* Linked to bpf_sk_storage */
- struct bpf_sk_storage __rcu *sk_storage;
- struct rcu_head rcu;
- /* 8 bytes hole */
- /* The data is stored in aother cacheline to minimize
- * the number of cachelines access during a cache hit.
- */
- struct bpf_sk_storage_data sdata ____cacheline_aligned;
-};
-
-#define SELEM(_SDATA) container_of((_SDATA), struct bpf_sk_storage_elem, sdata)
-#define SDATA(_SELEM) (&(_SELEM)->sdata)
-#define BPF_SK_STORAGE_CACHE_SIZE 16
-
-struct bpf_sk_storage {
- struct bpf_sk_storage_data __rcu *cache[BPF_SK_STORAGE_CACHE_SIZE];
- struct hlist_head list; /* List of bpf_sk_storage_elem */
- struct sock *sk; /* The sk that owns the the above "list" of
- * bpf_sk_storage_elem.
- */
- struct rcu_head rcu;
- raw_spinlock_t lock; /* Protect adding/removing from the "list" */
-};
-
-static struct bucket *select_bucket(struct bpf_sk_storage_map *smap,
- struct bpf_sk_storage_elem *selem)
-{
- return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
-}
-
-static int omem_charge(struct sock *sk, unsigned int size)
-{
- /* same check as in sock_kmalloc() */
- if (size <= sysctl_optmem_max &&
- atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
- atomic_add(size, &sk->sk_omem_alloc);
- return 0;
- }
-
- return -ENOMEM;
-}
-
-static bool selem_linked_to_sk(const struct bpf_sk_storage_elem *selem)
-{
- return !hlist_unhashed(&selem->snode);
-}
-
-static bool selem_linked_to_map(const struct bpf_sk_storage_elem *selem)
-{
- return !hlist_unhashed(&selem->map_node);
-}
-
-static struct bpf_sk_storage_elem *selem_alloc(struct bpf_sk_storage_map *smap,
- struct sock *sk, void *value,
- bool charge_omem)
-{
- struct bpf_sk_storage_elem *selem;
-
- if (charge_omem && omem_charge(sk, smap->elem_size))
- return NULL;
-
- selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN);
- if (selem) {
- if (value)
- memcpy(SDATA(selem)->data, value, smap->map.value_size);
- return selem;
- }
-
- if (charge_omem)
- atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
-
- return NULL;
-}
-
-/* sk_storage->lock must be held and selem->sk_storage == sk_storage.
- * The caller must ensure selem->smap is still valid to be
- * dereferenced for its smap->elem_size and smap->cache_idx.
- */
-static bool __selem_unlink_sk(struct bpf_sk_storage *sk_storage,
- struct bpf_sk_storage_elem *selem,
- bool uncharge_omem)
-{
- struct bpf_sk_storage_map *smap;
- bool free_sk_storage;
- struct sock *sk;
-
- smap = rcu_dereference(SDATA(selem)->smap);
- sk = sk_storage->sk;
-
- /* All uncharging on sk->sk_omem_alloc must be done first.
- * sk may be freed once the last selem is unlinked from sk_storage.
- */
- if (uncharge_omem)
- atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
-
- free_sk_storage = hlist_is_singular_node(&selem->snode,
- &sk_storage->list);
- if (free_sk_storage) {
- atomic_sub(sizeof(struct bpf_sk_storage), &sk->sk_omem_alloc);
- sk_storage->sk = NULL;
- /* After this RCU_INIT, sk may be freed and cannot be used */
- RCU_INIT_POINTER(sk->sk_bpf_storage, NULL);
-
- /* sk_storage is not freed now. sk_storage->lock is
- * still held and raw_spin_unlock_bh(&sk_storage->lock)
- * will be done by the caller.
- *
- * Although the unlock will be done under
- * rcu_read_lock(), it is more intutivie to
- * read if kfree_rcu(sk_storage, rcu) is done
- * after the raw_spin_unlock_bh(&sk_storage->lock).
- *
- * Hence, a "bool free_sk_storage" is returned
- * to the caller which then calls the kfree_rcu()
- * after unlock.
- */
- }
- hlist_del_init_rcu(&selem->snode);
- if (rcu_access_pointer(sk_storage->cache[smap->cache_idx]) ==
- SDATA(selem))
- RCU_INIT_POINTER(sk_storage->cache[smap->cache_idx], NULL);
-
- kfree_rcu(selem, rcu);
-
- return free_sk_storage;
-}
-
-static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
+static struct bpf_local_storage_data *
+bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
{
- struct bpf_sk_storage *sk_storage;
- bool free_sk_storage = false;
-
- if (unlikely(!selem_linked_to_sk(selem)))
- /* selem has already been unlinked from sk */
- return;
-
- sk_storage = rcu_dereference(selem->sk_storage);
- raw_spin_lock_bh(&sk_storage->lock);
- if (likely(selem_linked_to_sk(selem)))
- free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
- raw_spin_unlock_bh(&sk_storage->lock);
-
- if (free_sk_storage)
- kfree_rcu(sk_storage, rcu);
-}
-
-static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
- struct bpf_sk_storage_elem *selem)
-{
- RCU_INIT_POINTER(selem->sk_storage, sk_storage);
- hlist_add_head(&selem->snode, &sk_storage->list);
-}
-
-static void selem_unlink_map(struct bpf_sk_storage_elem *selem)
-{
- struct bpf_sk_storage_map *smap;
- struct bucket *b;
-
- if (unlikely(!selem_linked_to_map(selem)))
- /* selem has already be unlinked from smap */
- return;
-
- smap = rcu_dereference(SDATA(selem)->smap);
- b = select_bucket(smap, selem);
- raw_spin_lock_bh(&b->lock);
- if (likely(selem_linked_to_map(selem)))
- hlist_del_init_rcu(&selem->map_node);
- raw_spin_unlock_bh(&b->lock);
-}
-
-static void selem_link_map(struct bpf_sk_storage_map *smap,
- struct bpf_sk_storage_elem *selem)
-{
- struct bucket *b = select_bucket(smap, selem);
-
- raw_spin_lock_bh(&b->lock);
- RCU_INIT_POINTER(SDATA(selem)->smap, smap);
- hlist_add_head_rcu(&selem->map_node, &b->list);
- raw_spin_unlock_bh(&b->lock);
-}
-
-static void selem_unlink(struct bpf_sk_storage_elem *selem)
-{
- /* Always unlink from map before unlinking from sk_storage
- * because selem will be freed after successfully unlinked from
- * the sk_storage.
- */
- selem_unlink_map(selem);
- selem_unlink_sk(selem);
-}
-
-static struct bpf_sk_storage_data *
-__sk_storage_lookup(struct bpf_sk_storage *sk_storage,
- struct bpf_sk_storage_map *smap,
- bool cacheit_lockit)
-{
- struct bpf_sk_storage_data *sdata;
- struct bpf_sk_storage_elem *selem;
-
- /* Fast path (cache hit) */
- sdata = rcu_dereference(sk_storage->cache[smap->cache_idx]);
- if (sdata && rcu_access_pointer(sdata->smap) == smap)
- return sdata;
-
- /* Slow path (cache miss) */
- hlist_for_each_entry_rcu(selem, &sk_storage->list, snode)
- if (rcu_access_pointer(SDATA(selem)->smap) == smap)
- break;
-
- if (!selem)
- return NULL;
-
- sdata = SDATA(selem);
- if (cacheit_lockit) {
- /* spinlock is needed to avoid racing with the
- * parallel delete. Otherwise, publishing an already
- * deleted sdata to the cache will become a use-after-free
- * problem in the next __sk_storage_lookup().
- */
- raw_spin_lock_bh(&sk_storage->lock);
- if (selem_linked_to_sk(selem))
- rcu_assign_pointer(sk_storage->cache[smap->cache_idx],
- sdata);
- raw_spin_unlock_bh(&sk_storage->lock);
- }
-
- return sdata;
-}
-
-static struct bpf_sk_storage_data *
-sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
-{
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_map *smap;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_map *smap;
- sk_storage = rcu_dereference(sk->sk_bpf_storage);
+ sk_storage =
+ rcu_dereference_check(sk->sk_bpf_storage, bpf_rcu_lock_held());
if (!sk_storage)
return NULL;
- smap = (struct bpf_sk_storage_map *)map;
- return __sk_storage_lookup(sk_storage, smap, cacheit_lockit);
-}
-
-static int check_flags(const struct bpf_sk_storage_data *old_sdata,
- u64 map_flags)
-{
- if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
- /* elem already exists */
- return -EEXIST;
-
- if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
- /* elem doesn't exist, cannot update it */
- return -ENOENT;
-
- return 0;
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit);
}
-static int sk_storage_alloc(struct sock *sk,
- struct bpf_sk_storage_map *smap,
- struct bpf_sk_storage_elem *first_selem)
+static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
{
- struct bpf_sk_storage *prev_sk_storage, *sk_storage;
- int err;
-
- err = omem_charge(sk, sizeof(*sk_storage));
- if (err)
- return err;
+ struct bpf_local_storage_data *sdata;
- sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN);
- if (!sk_storage) {
- err = -ENOMEM;
- goto uncharge;
- }
- INIT_HLIST_HEAD(&sk_storage->list);
- raw_spin_lock_init(&sk_storage->lock);
- sk_storage->sk = sk;
-
- __selem_link_sk(sk_storage, first_selem);
- selem_link_map(smap, first_selem);
- /* Publish sk_storage to sk. sk->sk_lock cannot be acquired.
- * Hence, atomic ops is used to set sk->sk_bpf_storage
- * from NULL to the newly allocated sk_storage ptr.
- *
- * From now on, the sk->sk_bpf_storage pointer is protected
- * by the sk_storage->lock. Hence, when freeing
- * the sk->sk_bpf_storage, the sk_storage->lock must
- * be held before setting sk->sk_bpf_storage to NULL.
- */
- prev_sk_storage = cmpxchg((struct bpf_sk_storage **)&sk->sk_bpf_storage,
- NULL, sk_storage);
- if (unlikely(prev_sk_storage)) {
- selem_unlink_map(first_selem);
- err = -EAGAIN;
- goto uncharge;
-
- /* Note that even first_selem was linked to smap's
- * bucket->list, first_selem can be freed immediately
- * (instead of kfree_rcu) because
- * bpf_sk_storage_map_free() does a
- * synchronize_rcu() before walking the bucket->list.
- * Hence, no one is accessing selem from the
- * bucket->list under rcu_read_lock().
- */
- }
-
- return 0;
-
-uncharge:
- kfree(sk_storage);
- atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc);
- return err;
-}
-
-/* sk cannot be going away because it is linking new elem
- * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
- * Otherwise, it will become a leak (and other memory issues
- * during map destruction).
- */
-static struct bpf_sk_storage_data *sk_storage_update(struct sock *sk,
- struct bpf_map *map,
- void *value,
- u64 map_flags)
-{
- struct bpf_sk_storage_data *old_sdata = NULL;
- struct bpf_sk_storage_elem *selem;
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_map *smap;
- int err;
-
- /* BPF_EXIST and BPF_NOEXIST cannot be both set */
- if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
- /* BPF_F_LOCK can only be used in a value with spin_lock */
- unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
- return ERR_PTR(-EINVAL);
-
- smap = (struct bpf_sk_storage_map *)map;
- sk_storage = rcu_dereference(sk->sk_bpf_storage);
- if (!sk_storage || hlist_empty(&sk_storage->list)) {
- /* Very first elem for this sk */
- err = check_flags(NULL, map_flags);
- if (err)
- return ERR_PTR(err);
-
- selem = selem_alloc(smap, sk, value, true);
- if (!selem)
- return ERR_PTR(-ENOMEM);
-
- err = sk_storage_alloc(sk, smap, selem);
- if (err) {
- kfree(selem);
- atomic_sub(smap->elem_size, &sk->sk_omem_alloc);
- return ERR_PTR(err);
- }
-
- return SDATA(selem);
- }
-
- if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
- /* Hoping to find an old_sdata to do inline update
- * such that it can avoid taking the sk_storage->lock
- * and changing the lists.
- */
- old_sdata = __sk_storage_lookup(sk_storage, smap, false);
- err = check_flags(old_sdata, map_flags);
- if (err)
- return ERR_PTR(err);
- if (old_sdata && selem_linked_to_sk(SELEM(old_sdata))) {
- copy_map_value_locked(map, old_sdata->data,
- value, false);
- return old_sdata;
- }
- }
-
- raw_spin_lock_bh(&sk_storage->lock);
-
- /* Recheck sk_storage->list under sk_storage->lock */
- if (unlikely(hlist_empty(&sk_storage->list))) {
- /* A parallel del is happening and sk_storage is going
- * away. It has just been checked before, so very
- * unlikely. Return instead of retry to keep things
- * simple.
- */
- err = -EAGAIN;
- goto unlock_err;
- }
-
- old_sdata = __sk_storage_lookup(sk_storage, smap, false);
- err = check_flags(old_sdata, map_flags);
- if (err)
- goto unlock_err;
-
- if (old_sdata && (map_flags & BPF_F_LOCK)) {
- copy_map_value_locked(map, old_sdata->data, value, false);
- selem = SELEM(old_sdata);
- goto unlock;
- }
-
- /* sk_storage->lock is held. Hence, we are sure
- * we can unlink and uncharge the old_sdata successfully
- * later. Hence, instead of charging the new selem now
- * and then uncharge the old selem later (which may cause
- * a potential but unnecessary charge failure), avoid taking
- * a charge at all here (the "!old_sdata" check) and the
- * old_sdata will not be uncharged later during __selem_unlink_sk().
- */
- selem = selem_alloc(smap, sk, value, !old_sdata);
- if (!selem) {
- err = -ENOMEM;
- goto unlock_err;
- }
-
- /* First, link the new selem to the map */
- selem_link_map(smap, selem);
-
- /* Second, link (and publish) the new selem to sk_storage */
- __selem_link_sk(sk_storage, selem);
-
- /* Third, remove old selem, SELEM(old_sdata) */
- if (old_sdata) {
- selem_unlink_map(SELEM(old_sdata));
- __selem_unlink_sk(sk_storage, SELEM(old_sdata), false);
- }
-
-unlock:
- raw_spin_unlock_bh(&sk_storage->lock);
- return SDATA(selem);
-
-unlock_err:
- raw_spin_unlock_bh(&sk_storage->lock);
- return ERR_PTR(err);
-}
-
-static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
-{
- struct bpf_sk_storage_data *sdata;
-
- sdata = sk_storage_lookup(sk, map, false);
+ sdata = bpf_sk_storage_lookup(sk, map, false);
if (!sdata)
return -ENOENT;
- selem_unlink(SELEM(sdata));
+ bpf_selem_unlink(SELEM(sdata), true);
return 0;
}
@@ -514,8 +48,8 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
/* Called by __sk_destruct() & bpf_sk_storage_clone() */
void bpf_sk_storage_free(struct sock *sk)
{
- struct bpf_sk_storage_elem *selem;
- struct bpf_sk_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage *sk_storage;
bool free_sk_storage = false;
struct hlist_node *n;
@@ -531,7 +65,7 @@ void bpf_sk_storage_free(struct sock *sk)
* Thus, no elem can be added-to or deleted-from the
* sk_storage->list by the bpf_prog or by the bpf-map's syscall.
*
- * It is racing with bpf_sk_storage_map_free() alone
+ * It is racing with bpf_local_storage_map_free() alone
* when unlinking elem from the sk_storage->list and
* the map's bucket->list.
*/
@@ -540,8 +74,9 @@ void bpf_sk_storage_free(struct sock *sk)
/* Always unlink from map before unlinking from
* sk_storage.
*/
- selem_unlink_map(selem);
- free_sk_storage = __selem_unlink_sk(sk_storage, selem, true);
+ bpf_selem_unlink_map(selem);
+ free_sk_storage = bpf_selem_unlink_storage_nolock(
+ sk_storage, selem, true, false);
}
raw_spin_unlock_bh(&sk_storage->lock);
rcu_read_unlock();
@@ -552,126 +87,22 @@ void bpf_sk_storage_free(struct sock *sk)
static void bpf_sk_storage_map_free(struct bpf_map *map)
{
- struct bpf_sk_storage_elem *selem;
- struct bpf_sk_storage_map *smap;
- struct bucket *b;
- unsigned int i;
+ struct bpf_local_storage_map *smap;
- smap = (struct bpf_sk_storage_map *)map;
-
- /* Note that this map might be concurrently cloned from
- * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
- * RCU read section to finish before proceeding. New RCU
- * read sections should be prevented via bpf_map_inc_not_zero.
- */
- synchronize_rcu();
-
- /* bpf prog and the userspace can no longer access this map
- * now. No new selem (of this map) can be added
- * to the sk->sk_bpf_storage or to the map bucket's list.
- *
- * The elem of this map can be cleaned up here
- * or
- * by bpf_sk_storage_free() during __sk_destruct().
- */
- for (i = 0; i < (1U << smap->bucket_log); i++) {
- b = &smap->buckets[i];
-
- rcu_read_lock();
- /* No one is adding to b->list now */
- while ((selem = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(&b->list)),
- struct bpf_sk_storage_elem,
- map_node))) {
- selem_unlink(selem);
- cond_resched_rcu();
- }
- rcu_read_unlock();
- }
-
- /* bpf_sk_storage_free() may still need to access the map.
- * e.g. bpf_sk_storage_free() has unlinked selem from the map
- * which then made the above while((selem = ...)) loop
- * exited immediately.
- *
- * However, the bpf_sk_storage_free() still needs to access
- * the smap->elem_size to do the uncharging in
- * __selem_unlink_sk().
- *
- * Hence, wait another rcu grace period for the
- * bpf_sk_storage_free() to finish.
- */
- synchronize_rcu();
-
- kvfree(smap->buckets);
- kfree(map);
-}
-
-static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
-{
- if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
- !(attr->map_flags & BPF_F_NO_PREALLOC) ||
- attr->max_entries ||
- attr->key_size != sizeof(int) || !attr->value_size ||
- /* Enforce BTF for userspace sk dumping */
- !attr->btf_key_type_id || !attr->btf_value_type_id)
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (attr->value_size >= KMALLOC_MAX_SIZE -
- MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) ||
- /* U16_MAX is much more than enough for sk local storage
- * considering a tcp_sock is ~2k.
- */
- attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem))
- return -E2BIG;
-
- return 0;
+ smap = (struct bpf_local_storage_map *)map;
+ bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx);
+ bpf_local_storage_map_free(smap, NULL);
}
static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_sk_storage_map *smap;
- unsigned int i;
- u32 nbuckets;
- u64 cost;
- int ret;
-
- smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
- if (!smap)
- return ERR_PTR(-ENOMEM);
- bpf_map_init_from_attr(&smap->map, attr);
-
- nbuckets = roundup_pow_of_two(num_possible_cpus());
- /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
- nbuckets = max_t(u32, 2, nbuckets);
- smap->bucket_log = ilog2(nbuckets);
- cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
-
- ret = bpf_map_charge_init(&smap->map.memory, cost);
- if (ret < 0) {
- kfree(smap);
- return ERR_PTR(ret);
- }
+ struct bpf_local_storage_map *smap;
- smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
- GFP_USER | __GFP_NOWARN);
- if (!smap->buckets) {
- bpf_map_charge_finish(&smap->map.memory);
- kfree(smap);
- return ERR_PTR(-ENOMEM);
- }
-
- for (i = 0; i < nbuckets; i++) {
- INIT_HLIST_HEAD(&smap->buckets[i].list);
- raw_spin_lock_init(&smap->buckets[i].lock);
- }
-
- smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
- smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) %
- BPF_SK_STORAGE_CACHE_SIZE;
+ smap = bpf_local_storage_map_alloc(attr);
+ if (IS_ERR(smap))
+ return ERR_CAST(smap);
+ smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache);
return &smap->map;
}
@@ -681,33 +112,16 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
return -ENOTSUPP;
}
-static int bpf_sk_storage_map_check_btf(const struct bpf_map *map,
- const struct btf *btf,
- const struct btf_type *key_type,
- const struct btf_type *value_type)
-{
- u32 int_data;
-
- if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
- return -EINVAL;
-
- int_data = *(u32 *)(key_type + 1);
- if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
- return -EINVAL;
-
- return 0;
-}
-
static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
{
- struct bpf_sk_storage_data *sdata;
+ struct bpf_local_storage_data *sdata;
struct socket *sock;
int fd, err;
fd = *(int *)key;
sock = sockfd_lookup(fd, &err);
if (sock) {
- sdata = sk_storage_lookup(sock->sk, map, true);
+ sdata = bpf_sk_storage_lookup(sock->sk, map, true);
sockfd_put(sock);
return sdata ? sdata->data : NULL;
}
@@ -718,14 +132,16 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
- struct bpf_sk_storage_data *sdata;
+ struct bpf_local_storage_data *sdata;
struct socket *sock;
int fd, err;
fd = *(int *)key;
sock = sockfd_lookup(fd, &err);
if (sock) {
- sdata = sk_storage_update(sock->sk, map, value, map_flags);
+ sdata = bpf_local_storage_update(
+ sock->sk, (struct bpf_local_storage_map *)map, value,
+ map_flags, GFP_ATOMIC);
sockfd_put(sock);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -741,7 +157,7 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
fd = *(int *)key;
sock = sockfd_lookup(fd, &err);
if (sock) {
- err = sk_storage_delete(sock->sk, map);
+ err = bpf_sk_storage_del(sock->sk, map);
sockfd_put(sock);
return err;
}
@@ -749,14 +165,14 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
return err;
}
-static struct bpf_sk_storage_elem *
+static struct bpf_local_storage_elem *
bpf_sk_storage_clone_elem(struct sock *newsk,
- struct bpf_sk_storage_map *smap,
- struct bpf_sk_storage_elem *selem)
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
{
- struct bpf_sk_storage_elem *copy_selem;
+ struct bpf_local_storage_elem *copy_selem;
- copy_selem = selem_alloc(smap, newsk, NULL, true);
+ copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC);
if (!copy_selem)
return NULL;
@@ -772,9 +188,9 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
{
- struct bpf_sk_storage *new_sk_storage = NULL;
- struct bpf_sk_storage *sk_storage;
- struct bpf_sk_storage_elem *selem;
+ struct bpf_local_storage *new_sk_storage = NULL;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
int ret = 0;
RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
@@ -786,8 +202,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
goto out;
hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
- struct bpf_sk_storage_elem *copy_selem;
- struct bpf_sk_storage_map *smap;
+ struct bpf_local_storage_elem *copy_selem;
+ struct bpf_local_storage_map *smap;
struct bpf_map *map;
smap = rcu_dereference(SDATA(selem)->smap);
@@ -795,7 +211,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
continue;
/* Note that for lockless listeners adding new element
- * here can race with cleanup in bpf_sk_storage_map_free.
+ * here can race with cleanup in bpf_local_storage_map_free.
* Try to grab map refcnt to make sure that it's still
* alive and prevent concurrent removal.
*/
@@ -811,10 +227,10 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
}
if (new_sk_storage) {
- selem_link_map(smap, copy_selem);
- __selem_link_sk(new_sk_storage, copy_selem);
+ bpf_selem_link_map(smap, copy_selem);
+ bpf_selem_link_storage_nolock(new_sk_storage, copy_selem);
} else {
- ret = sk_storage_alloc(newsk, smap, copy_selem);
+ ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
if (ret) {
kfree(copy_selem);
atomic_sub(smap->elem_size,
@@ -823,7 +239,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
goto out;
}
- new_sk_storage = rcu_dereference(copy_selem->sk_storage);
+ new_sk_storage =
+ rcu_dereference(copy_selem->local_storage);
}
bpf_map_put(map);
}
@@ -838,15 +255,17 @@ out:
return ret;
}
-BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
- void *, value, u64, flags)
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
+ void *, value, u64, flags, gfp_t, gfp_flags)
{
- struct bpf_sk_storage_data *sdata;
+ struct bpf_local_storage_data *sdata;
- if (flags > BPF_SK_STORAGE_GET_F_CREATE)
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
return (unsigned long)NULL;
- sdata = sk_storage_lookup(sk, map, true);
+ sdata = bpf_sk_storage_lookup(sk, map, true);
if (sdata)
return (unsigned long)sdata->data;
@@ -857,7 +276,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
* destruction).
*/
refcount_inc_not_zero(&sk->sk_refcnt)) {
- sdata = sk_storage_update(sk, map, value, BPF_NOEXIST);
+ sdata = bpf_local_storage_update(
+ sk, (struct bpf_local_storage_map *)map, value,
+ BPF_NOEXIST, gfp_flags);
/* sk must be a fullsock (guaranteed by verifier),
* so sock_gen_put() is unnecessary.
*/
@@ -871,10 +292,14 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!sk || !sk_fullsock(sk))
+ return -EINVAL;
+
if (refcount_inc_not_zero(&sk->sk_refcnt)) {
int err;
- err = sk_storage_delete(sk, map);
+ err = bpf_sk_storage_del(sk, map);
sock_put(sk);
return err;
}
@@ -882,15 +307,53 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
return -ENOENT;
}
+static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap,
+ void *owner, u32 size)
+{
+ int optmem_max = READ_ONCE(sysctl_optmem_max);
+ struct sock *sk = (struct sock *)owner;
+
+ /* same check as in sock_kmalloc() */
+ if (size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
+ atomic_add(size, &sk->sk_omem_alloc);
+ return 0;
+ }
+
+ return -ENOMEM;
+}
+
+static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap,
+ void *owner, u32 size)
+{
+ struct sock *sk = owner;
+
+ atomic_sub(size, &sk->sk_omem_alloc);
+}
+
+static struct bpf_local_storage __rcu **
+bpf_sk_storage_ptr(void *owner)
+{
+ struct sock *sk = owner;
+
+ return &sk->sk_bpf_storage;
+}
+
+BTF_ID_LIST_SINGLE(sk_storage_map_btf_ids, struct, bpf_local_storage_map)
const struct bpf_map_ops sk_storage_map_ops = {
- .map_alloc_check = bpf_sk_storage_map_alloc_check,
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
.map_alloc = bpf_sk_storage_map_alloc,
.map_free = bpf_sk_storage_map_free,
.map_get_next_key = notsupp_get_next_key,
.map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
.map_update_elem = bpf_fd_sk_storage_update_elem,
.map_delete_elem = bpf_fd_sk_storage_delete_elem,
- .map_check_btf = bpf_sk_storage_map_check_btf,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_btf_id = &sk_storage_map_btf_ids[0],
+ .map_local_storage_charge = bpf_sk_storage_charge,
+ .map_local_storage_uncharge = bpf_sk_storage_uncharge,
+ .map_owner_storage_ptr = bpf_sk_storage_ptr,
};
const struct bpf_func_proto bpf_sk_storage_get_proto = {
@@ -898,7 +361,17 @@ const struct bpf_func_proto bpf_sk_storage_get_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_SOCKET,
+ .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto = {
+ .func = bpf_sk_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_CTX, /* context is 'struct sock' */
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
@@ -908,5 +381,582 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_SOCKET,
+ .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+};
+
+static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
+{
+ const struct btf *btf_vmlinux;
+ const struct btf_type *t;
+ const char *tname;
+ u32 btf_id;
+
+ if (prog->aux->dst_prog)
+ return false;
+
+ /* Ensure the tracing program is not tracing
+ * any bpf_sk_storage*() function and also
+ * use the bpf_sk_storage_(get|delete) helper.
+ */
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_ITER:
+ case BPF_TRACE_RAW_TP:
+ /* bpf_sk_storage has no trace point */
+ return true;
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ btf_vmlinux = bpf_get_btf_vmlinux();
+ if (IS_ERR_OR_NULL(btf_vmlinux))
+ return false;
+ btf_id = prog->aux->attach_btf_id;
+ t = btf_type_by_id(btf_vmlinux, btf_id);
+ tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+ return !!strncmp(tname, "bpf_sk_storage",
+ strlen("bpf_sk_storage"));
+ default:
+ return false;
+ }
+
+ return false;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (in_hardirq() || in_nmi())
+ return (unsigned long)NULL;
+
+ return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags,
+ gfp_flags);
+}
+
+BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
+ struct sock *, sk)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (in_hardirq() || in_nmi())
+ return -EPERM;
+
+ return ____bpf_sk_storage_delete(map, sk);
+}
+
+const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = {
+ .func = bpf_sk_storage_get_tracing,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+ .allowed = bpf_sk_storage_tracing_allowed,
+};
+
+const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = {
+ .func = bpf_sk_storage_delete_tracing,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ .allowed = bpf_sk_storage_tracing_allowed,
};
+
+struct bpf_sk_storage_diag {
+ u32 nr_maps;
+ struct bpf_map *maps[];
+};
+
+/* The reply will be like:
+ * INET_DIAG_BPF_SK_STORAGES (nla_nest)
+ * SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ * SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ * ....
+ */
+static int nla_value_size(u32 value_size)
+{
+ /* SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ */
+ return nla_total_size(0) + nla_total_size(sizeof(u32)) +
+ nla_total_size_64bit(value_size);
+}
+
+void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag)
+{
+ u32 i;
+
+ if (!diag)
+ return;
+
+ for (i = 0; i < diag->nr_maps; i++)
+ bpf_map_put(diag->maps[i]);
+
+ kfree(diag);
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free);
+
+static bool diag_check_dup(const struct bpf_sk_storage_diag *diag,
+ const struct bpf_map *map)
+{
+ u32 i;
+
+ for (i = 0; i < diag->nr_maps; i++) {
+ if (diag->maps[i] == map)
+ return true;
+ }
+
+ return false;
+}
+
+struct bpf_sk_storage_diag *
+bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
+{
+ struct bpf_sk_storage_diag *diag;
+ struct nlattr *nla;
+ u32 nr_maps = 0;
+ int rem, err;
+
+ /* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN as
+ * the map_alloc_check() side also does.
+ */
+ if (!bpf_capable())
+ return ERR_PTR(-EPERM);
+
+ nla_for_each_nested(nla, nla_stgs, rem) {
+ if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
+ nr_maps++;
+ }
+
+ diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL);
+ if (!diag)
+ return ERR_PTR(-ENOMEM);
+
+ nla_for_each_nested(nla, nla_stgs, rem) {
+ struct bpf_map *map;
+ int map_fd;
+
+ if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
+ continue;
+
+ map_fd = nla_get_u32(nla);
+ map = bpf_map_get(map_fd);
+ if (IS_ERR(map)) {
+ err = PTR_ERR(map);
+ goto err_free;
+ }
+ if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) {
+ bpf_map_put(map);
+ err = -EINVAL;
+ goto err_free;
+ }
+ if (diag_check_dup(diag, map)) {
+ bpf_map_put(map);
+ err = -EEXIST;
+ goto err_free;
+ }
+ diag->maps[diag->nr_maps++] = map;
+ }
+
+ return diag;
+
+err_free:
+ bpf_sk_storage_diag_free(diag);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
+
+static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
+{
+ struct nlattr *nla_stg, *nla_value;
+ struct bpf_local_storage_map *smap;
+
+ /* It cannot exceed max nlattr's payload */
+ BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE);
+
+ nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE);
+ if (!nla_stg)
+ return -EMSGSIZE;
+
+ smap = rcu_dereference(sdata->smap);
+ if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id))
+ goto errout;
+
+ nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE,
+ smap->map.value_size,
+ SK_DIAG_BPF_STORAGE_PAD);
+ if (!nla_value)
+ goto errout;
+
+ if (map_value_has_spin_lock(&smap->map))
+ copy_map_value_locked(&smap->map, nla_data(nla_value),
+ sdata->data, true);
+ else
+ copy_map_value(&smap->map, nla_data(nla_value), sdata->data);
+
+ nla_nest_end(skb, nla_stg);
+ return 0;
+
+errout:
+ nla_nest_cancel(skb, nla_stg);
+ return -EMSGSIZE;
+}
+
+static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
+ int stg_array_type,
+ unsigned int *res_diag_size)
+{
+ /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
+ unsigned int diag_size = nla_total_size(0);
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
+ struct nlattr *nla_stgs;
+ unsigned int saved_len;
+ int err = 0;
+
+ rcu_read_lock();
+
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+ if (!sk_storage || hlist_empty(&sk_storage->list)) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ nla_stgs = nla_nest_start(skb, stg_array_type);
+ if (!nla_stgs)
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+
+ saved_len = skb->len;
+ hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
+ smap = rcu_dereference(SDATA(selem)->smap);
+ diag_size += nla_value_size(smap->map.value_size);
+
+ if (nla_stgs && diag_get(SDATA(selem), skb))
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+ }
+
+ rcu_read_unlock();
+
+ if (nla_stgs) {
+ if (saved_len == skb->len)
+ nla_nest_cancel(skb, nla_stgs);
+ else
+ nla_nest_end(skb, nla_stgs);
+ }
+
+ if (diag_size == nla_total_size(0)) {
+ *res_diag_size = 0;
+ return 0;
+ }
+
+ *res_diag_size = diag_size;
+ return err;
+}
+
+int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
+ struct sock *sk, struct sk_buff *skb,
+ int stg_array_type,
+ unsigned int *res_diag_size)
+{
+ /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
+ unsigned int diag_size = nla_total_size(0);
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_data *sdata;
+ struct nlattr *nla_stgs;
+ unsigned int saved_len;
+ int err = 0;
+ u32 i;
+
+ *res_diag_size = 0;
+
+ /* No map has been specified. Dump all. */
+ if (!diag->nr_maps)
+ return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type,
+ res_diag_size);
+
+ rcu_read_lock();
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+ if (!sk_storage || hlist_empty(&sk_storage->list)) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ nla_stgs = nla_nest_start(skb, stg_array_type);
+ if (!nla_stgs)
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+
+ saved_len = skb->len;
+ for (i = 0; i < diag->nr_maps; i++) {
+ sdata = bpf_local_storage_lookup(sk_storage,
+ (struct bpf_local_storage_map *)diag->maps[i],
+ false);
+
+ if (!sdata)
+ continue;
+
+ diag_size += nla_value_size(diag->maps[i]->value_size);
+
+ if (nla_stgs && diag_get(sdata, skb))
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+ }
+ rcu_read_unlock();
+
+ if (nla_stgs) {
+ if (saved_len == skb->len)
+ nla_nest_cancel(skb, nla_stgs);
+ else
+ nla_nest_end(skb, nla_stgs);
+ }
+
+ if (diag_size == nla_total_size(0)) {
+ *res_diag_size = 0;
+ return 0;
+ }
+
+ *res_diag_size = diag_size;
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put);
+
+struct bpf_iter_seq_sk_storage_map_info {
+ struct bpf_map *map;
+ unsigned int bucket_id;
+ unsigned skip_elems;
+};
+
+static struct bpf_local_storage_elem *
+bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
+ struct bpf_local_storage_elem *prev_selem)
+ __acquires(RCU) __releases(RCU)
+{
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
+ u32 skip_elems = info->skip_elems;
+ struct bpf_local_storage_map *smap;
+ u32 bucket_id = info->bucket_id;
+ u32 i, count, n_buckets;
+ struct bpf_local_storage_map_bucket *b;
+
+ smap = (struct bpf_local_storage_map *)info->map;
+ n_buckets = 1U << smap->bucket_log;
+ if (bucket_id >= n_buckets)
+ return NULL;
+
+ /* try to find next selem in the same bucket */
+ selem = prev_selem;
+ count = 0;
+ while (selem) {
+ selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)),
+ struct bpf_local_storage_elem, map_node);
+ if (!selem) {
+ /* not found, unlock and go to the next bucket */
+ b = &smap->buckets[bucket_id++];
+ rcu_read_unlock();
+ skip_elems = 0;
+ break;
+ }
+ sk_storage = rcu_dereference(selem->local_storage);
+ if (sk_storage) {
+ info->skip_elems = skip_elems + count;
+ return selem;
+ }
+ count++;
+ }
+
+ for (i = bucket_id; i < (1U << smap->bucket_log); i++) {
+ b = &smap->buckets[i];
+ rcu_read_lock();
+ count = 0;
+ hlist_for_each_entry_rcu(selem, &b->list, map_node) {
+ sk_storage = rcu_dereference(selem->local_storage);
+ if (sk_storage && count >= skip_elems) {
+ info->bucket_id = i;
+ info->skip_elems = count;
+ return selem;
+ }
+ count++;
+ }
+ rcu_read_unlock();
+ skip_elems = 0;
+ }
+
+ info->bucket_id = i;
+ info->skip_elems = 0;
+ return NULL;
+}
+
+static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL);
+ if (!selem)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return selem;
+}
+
+static void *bpf_sk_storage_map_seq_next(struct seq_file *seq, void *v,
+ loff_t *pos)
+{
+ struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
+
+ ++*pos;
+ ++info->skip_elems;
+ return bpf_sk_storage_map_seq_find_next(seq->private, v);
+}
+
+struct bpf_iter__bpf_sk_storage_map {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_map *, map);
+ __bpf_md_ptr(struct sock *, sk);
+ __bpf_md_ptr(void *, value);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta,
+ struct bpf_map *map, struct sock *sk,
+ void *value)
+
+static int __bpf_sk_storage_map_seq_show(struct seq_file *seq,
+ struct bpf_local_storage_elem *selem)
+{
+ struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
+ struct bpf_iter__bpf_sk_storage_map ctx = {};
+ struct bpf_local_storage *sk_storage;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, selem == NULL);
+ if (prog) {
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (selem) {
+ sk_storage = rcu_dereference(selem->local_storage);
+ ctx.sk = sk_storage->owner;
+ ctx.value = SDATA(selem)->data;
+ }
+ ret = bpf_iter_run_prog(prog, &ctx);
+ }
+
+ return ret;
+}
+
+static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_sk_storage_map_seq_show(seq, v);
+}
+
+static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ if (!v)
+ (void)__bpf_sk_storage_map_seq_show(seq, v);
+ else
+ rcu_read_unlock();
+}
+
+static int bpf_iter_init_sk_storage_map(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data;
+
+ bpf_map_inc_with_uref(aux->map);
+ seq_info->map = aux->map;
+ return 0;
+}
+
+static void bpf_iter_fini_sk_storage_map(void *priv_data)
+{
+ struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data;
+
+ bpf_map_put_with_uref(seq_info->map);
+}
+
+static int bpf_iter_attach_map(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_map *map;
+ int err = -EINVAL;
+
+ if (!linfo->map.map_fd)
+ return -EBADF;
+
+ map = bpf_map_get_with_uref(linfo->map.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
+ goto put_map;
+
+ if (prog->aux->max_rdwr_access > map->value_size) {
+ err = -EACCES;
+ goto put_map;
+ }
+
+ aux->map = map;
+ return 0;
+
+put_map:
+ bpf_map_put_with_uref(map);
+ return err;
+}
+
+static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux)
+{
+ bpf_map_put_with_uref(aux->map);
+}
+
+static const struct seq_operations bpf_sk_storage_map_seq_ops = {
+ .start = bpf_sk_storage_map_seq_start,
+ .next = bpf_sk_storage_map_seq_next,
+ .stop = bpf_sk_storage_map_seq_stop,
+ .show = bpf_sk_storage_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info iter_seq_info = {
+ .seq_ops = &bpf_sk_storage_map_seq_ops,
+ .init_seq_private = bpf_iter_init_sk_storage_map,
+ .fini_seq_private = bpf_iter_fini_sk_storage_map,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_sk_storage_map_info),
+};
+
+static struct bpf_iter_reg bpf_sk_storage_map_reg_info = {
+ .target = "bpf_sk_storage_map",
+ .attach_target = bpf_iter_attach_map,
+ .detach_target = bpf_iter_detach_map,
+ .show_fdinfo = bpf_iter_map_show_fdinfo,
+ .fill_link_info = bpf_iter_map_fill_link_info,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_sk_storage_map, sk),
+ PTR_TO_BTF_ID_OR_NULL },
+ { offsetof(struct bpf_iter__bpf_sk_storage_map, value),
+ PTR_TO_BUF | PTR_MAYBE_NULL },
+ },
+ .seq_info = &iter_seq_info,
+};
+
+static int __init bpf_sk_storage_map_iter_init(void)
+{
+ bpf_sk_storage_map_reg_info.ctx_arg_info[0].btf_id =
+ btf_sock_ids[BTF_SOCK_TYPE_SOCK];
+ return bpf_iter_reg_target(&bpf_sk_storage_map_reg_info);
+}
+late_initcall(bpf_sk_storage_map_iter_init);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index a78e7f864c1e..e4ff2db40c98 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -51,6 +51,7 @@
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -61,8 +62,6 @@
#include <trace/events/skb.h>
#include <net/busy_poll.h>
-#include "datagram.h"
-
/*
* Is a socket 'connection oriented' ?
*/
@@ -166,8 +165,6 @@ done:
struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
struct sk_buff_head *queue,
unsigned int flags,
- void (*destructor)(struct sock *sk,
- struct sk_buff *skb),
int *off, int *err,
struct sk_buff **last)
{
@@ -198,8 +195,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
refcount_inc(&skb->users);
} else {
__skb_unlink(skb, queue);
- if (destructor)
- destructor(sk, skb);
}
*off = _off;
return skb;
@@ -212,7 +207,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
* @sk: socket
* @queue: socket queue from which to receive
* @flags: MSG\_ flags
- * @destructor: invoked under the receive lock on successful dequeue
* @off: an offset in bytes to peek skb from. Returns an offset
* within an skb where data actually starts
* @err: error code returned
@@ -245,10 +239,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
*/
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
struct sk_buff_head *queue,
- unsigned int flags,
- void (*destructor)(struct sock *sk,
- struct sk_buff *skb),
- int *off, int *err,
+ unsigned int flags, int *off, int *err,
struct sk_buff **last)
{
struct sk_buff *skb;
@@ -269,8 +260,8 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
* However, this function was correct in any case. 8)
*/
spin_lock_irqsave(&queue->lock, cpu_flags);
- skb = __skb_try_recv_from_queue(sk, queue, flags, destructor,
- off, &error, last);
+ skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error,
+ last);
spin_unlock_irqrestore(&queue->lock, cpu_flags);
if (error)
goto no_packet;
@@ -293,10 +284,7 @@ EXPORT_SYMBOL(__skb_try_recv_datagram);
struct sk_buff *__skb_recv_datagram(struct sock *sk,
struct sk_buff_head *sk_queue,
- unsigned int flags,
- void (*destructor)(struct sock *sk,
- struct sk_buff *skb),
- int *off, int *err)
+ unsigned int flags, int *off, int *err)
{
struct sk_buff *skb, *last;
long timeo;
@@ -304,8 +292,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
- skb = __skb_try_recv_datagram(sk, sk_queue, flags, destructor,
- off, err, &last);
+ skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err,
+ &last);
if (skb)
return skb;
@@ -320,20 +308,18 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk,
EXPORT_SYMBOL(__skb_recv_datagram);
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
- int noblock, int *err)
+ int *err)
{
int off = 0;
- return __skb_recv_datagram(sk, &sk->sk_receive_queue,
- flags | (noblock ? MSG_DONTWAIT : 0),
- NULL, &off, err);
+ return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags,
+ &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
consume_skb(skb);
- sk_mem_reclaim_partial(sk);
}
EXPORT_SYMBOL(skb_free_datagram);
@@ -349,7 +335,6 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
slow = lock_sock_fast(sk);
sk_peek_offset_bwd(sk, len);
skb_orphan(skb);
- sk_mem_reclaim_partial(sk);
unlock_sock_fast(sk, slow);
/* skb is now orphaned, can be freed outside of locked section */
@@ -409,11 +394,15 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
NULL);
kfree_skb(skb);
- sk_mem_reclaim_partial(sk);
return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
+INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr,
+ size_t bytes,
+ void *data __always_unused,
+ struct iov_iter *i));
+
static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len, bool fault_short,
size_t (*cb)(const void *, size_t, void *,
@@ -427,7 +416,8 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- n = cb(skb->data + offset, copy, data, to);
+ n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ skb->data + offset, copy, data, to);
offset += n;
if (n != copy)
goto short_copy;
@@ -449,8 +439,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
- n = cb(vaddr + skb_frag_off(frag) + offset - start,
- copy, data, to);
+ n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ vaddr + skb_frag_off(frag) + offset - start,
+ copy, data, to);
kunmap(page);
offset += n;
if (n != copy)
@@ -619,27 +610,33 @@ fault:
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
-int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
- struct iov_iter *from, size_t length)
+int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, struct iov_iter *from,
+ size_t length)
{
- int frag = skb_shinfo(skb)->nr_frags;
+ int frag;
+
+ if (msg && msg->msg_ubuf && msg->sg_from_iter)
+ return msg->sg_from_iter(sk, skb, from, length);
+
+ frag = skb_shinfo(skb)->nr_frags;
while (length && iov_iter_count(from)) {
struct page *pages[MAX_SKB_FRAGS];
+ struct page *last_head = NULL;
size_t start;
ssize_t copied;
unsigned long truesize;
- int n = 0;
+ int refs, n = 0;
if (frag == MAX_SKB_FRAGS)
return -EMSGSIZE;
- copied = iov_iter_get_pages(from, pages, length,
+ copied = iov_iter_get_pages2(from, pages, length,
MAX_SKB_FRAGS - frag, &start);
if (copied < 0)
return -EFAULT;
- iov_iter_advance(from, copied);
length -= copied;
truesize = PAGE_ALIGN(copied + start);
@@ -648,17 +645,42 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
skb->truesize += truesize;
if (sk && sk->sk_type == SOCK_STREAM) {
sk_wmem_queued_add(sk, truesize);
- sk_mem_charge(sk, truesize);
+ if (!skb_zcopy_pure(skb))
+ sk_mem_charge(sk, truesize);
} else {
refcount_add(truesize, &skb->sk->sk_wmem_alloc);
}
- while (copied) {
+ for (refs = 0; copied != 0; start = 0) {
int size = min_t(int, copied, PAGE_SIZE - start);
- skb_fill_page_desc(skb, frag++, pages[n], start, size);
- start = 0;
+ struct page *head = compound_head(pages[n]);
+
+ start += (pages[n] - head) << PAGE_SHIFT;
copied -= size;
n++;
+ if (frag) {
+ skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1];
+
+ if (head == skb_frag_page(last) &&
+ start == skb_frag_off(last) + skb_frag_size(last)) {
+ skb_frag_size_add(last, size);
+ /* We combined this page, we need to release
+ * a reference. Since compound pages refcount
+ * is shared among many pages, batch the refcount
+ * adjustments to limit false sharing.
+ */
+ last_head = head;
+ refs++;
+ continue;
+ }
+ }
+ if (refs) {
+ page_ref_sub(last_head, refs);
+ refs = 0;
+ }
+ skb_fill_page_desc_noacc(skb, frag++, head, start, size);
}
+ if (refs)
+ page_ref_sub(last_head, refs);
}
return 0;
}
@@ -682,12 +704,12 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
if (skb_copy_datagram_from_iter(skb, 0, from, copy))
return -EFAULT;
- return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
+ return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
/**
- * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator
+ * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator
* and update a checksum.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
@@ -699,8 +721,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len,
__wsum *csump)
{
- return __skb_datagram_iter(skb, offset, to, len, true,
- csum_and_copy_to_iter, csump);
+ struct csum_state csdata = { .csum = *csump };
+ int ret;
+
+ ret = __skb_datagram_iter(skb, offset, to, len, true,
+ csum_and_copy_to_iter, &csdata);
+ if (ret)
+ return ret;
+
+ *csump = csdata.csum;
+ return 0;
}
/**
diff --git a/net/core/datagram.h b/net/core/datagram.h
deleted file mode 100644
index bcfb75bfa3b2..000000000000
--- a/net/core/datagram.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _NET_CORE_DATAGRAM_H_
-#define _NET_CORE_DATAGRAM_H_
-
-#include <linux/types.h>
-
-struct sock;
-struct sk_buff;
-struct iov_iter;
-
-int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
- struct iov_iter *from, size_t length);
-
-#endif /* _NET_CORE_DATAGRAM_H_ */
diff --git a/net/core/dev.c b/net/core/dev.c
index c6c985fe7b1b..3be256051e99 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -79,6 +79,7 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
@@ -90,6 +91,7 @@
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/skbuff.h>
+#include <linux/kthread.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
@@ -97,8 +99,10 @@
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
+#include <net/dsa.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <net/gro.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
@@ -127,6 +131,7 @@
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
+#include <trace/events/qdisc.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
@@ -135,26 +140,24 @@
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
-#include <linux/netfilter_ingress.h>
+#include <linux/netfilter_netdev.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <net/devlink.h>
+#include <linux/pm_runtime.h>
+#include <linux/prandom.h>
+#include <linux/once_lite.h>
+#include "dev.h"
#include "net-sysfs.h"
-#define MAX_GRO_SKBS 8
-
-/* This should be increased if a protocol with a bigger head is added. */
-#define GRO_MAX_HEAD (MAX_HEADER + 128)
static DEFINE_SPINLOCK(ptype_lock);
-static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly; /* Taps */
-static struct list_head offload_base __read_mostly;
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
@@ -194,7 +197,7 @@ static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
-static seqcount_t devnet_rename_seq;
+static DECLARE_RWSEM(devnet_rename_sem);
static inline void dev_base_seq_inc(struct net *net)
{
@@ -214,18 +217,38 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}
-static inline void rps_lock(struct softnet_data *sd)
+static inline void rps_lock_irqsave(struct softnet_data *sd,
+ unsigned long *flags)
{
-#ifdef CONFIG_RPS
- spin_lock(&sd->input_pkt_queue.lock);
-#endif
+ if (IS_ENABLED(CONFIG_RPS))
+ spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
+ else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_save(*flags);
}
-static inline void rps_unlock(struct softnet_data *sd)
+static inline void rps_lock_irq_disable(struct softnet_data *sd)
{
-#ifdef CONFIG_RPS
- spin_unlock(&sd->input_pkt_queue.lock);
-#endif
+ if (IS_ENABLED(CONFIG_RPS))
+ spin_lock_irq(&sd->input_pkt_queue.lock);
+ else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
+}
+
+static inline void rps_unlock_irq_restore(struct softnet_data *sd,
+ unsigned long *flags)
+{
+ if (IS_ENABLED(CONFIG_RPS))
+ spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
+ else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_restore(*flags);
+}
+
+static inline void rps_unlock_irq_enable(struct softnet_data *sd)
+{
+ if (IS_ENABLED(CONFIG_RPS))
+ spin_unlock_irq(&sd->input_pkt_queue.lock);
+ else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
}
static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
@@ -295,6 +318,12 @@ static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
return NULL;
}
+bool netdev_name_in_use(struct net *net, const char *name)
+{
+ return netdev_name_node_lookup(net, name);
+}
+EXPORT_SYMBOL(netdev_name_in_use);
+
int netdev_name_node_alt_create(struct net_device *dev, const char *name)
{
struct netdev_name_node *name_node;
@@ -312,7 +341,6 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name)
return 0;
}
-EXPORT_SYMBOL(netdev_name_node_alt_create);
static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
{
@@ -340,7 +368,6 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
return 0;
}
-EXPORT_SYMBOL(netdev_name_node_alt_destroy);
static void netdev_name_node_alt_flush(struct net_device *dev)
{
@@ -357,12 +384,12 @@ static void list_netdevice(struct net_device *dev)
ASSERT_RTNL();
- write_lock_bh(&dev_base_lock);
+ write_lock(&dev_base_lock);
list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
netdev_name_node_add(net, dev->name_node);
hlist_add_head_rcu(&dev->index_hlist,
dev_index_hash(net, dev->ifindex));
- write_unlock_bh(&dev_base_lock);
+ write_unlock(&dev_base_lock);
dev_base_seq_inc(net);
}
@@ -370,16 +397,18 @@ static void list_netdevice(struct net_device *dev)
/* Device list removal
* caller must respect a RCU grace period before freeing/reusing dev
*/
-static void unlist_netdevice(struct net_device *dev)
+static void unlist_netdevice(struct net_device *dev, bool lock)
{
ASSERT_RTNL();
/* Unlink dev from the device chain */
- write_lock_bh(&dev_base_lock);
+ if (lock)
+ write_lock(&dev_base_lock);
list_del_rcu(&dev->dev_list);
netdev_name_node_del(dev->name_node);
hlist_del_rcu(&dev->index_hlist);
- write_unlock_bh(&dev_base_lock);
+ if (lock)
+ write_unlock(&dev_base_lock);
dev_base_seq_inc(dev_net(dev));
}
@@ -398,6 +427,89 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
+#ifdef CONFIG_LOCKDEP
+/*
+ * register_netdevice() inits txq->_xmit_lock and sets lockdep class
+ * according to dev->type
+ */
+static const unsigned short netdev_lock_type[] = {
+ ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
+ ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
+ ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
+ ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
+ ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
+ ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
+ ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
+ ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
+ ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
+ ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
+ ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
+ ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
+ ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
+ ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
+ ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
+
+static const char *const netdev_lock_name[] = {
+ "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
+ "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
+ "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
+ "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
+ "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
+ "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
+ "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
+ "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
+ "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
+ "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
+ "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
+ "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
+ "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
+ "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
+ "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
+
+static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
+static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
+
+static inline unsigned short netdev_lock_pos(unsigned short dev_type)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
+ if (netdev_lock_type[i] == dev_type)
+ return i;
+ /* the last key is used by default */
+ return ARRAY_SIZE(netdev_lock_type) - 1;
+}
+
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+ unsigned short dev_type)
+{
+ int i;
+
+ i = netdev_lock_pos(dev_type);
+ lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
+ netdev_lock_name[i]);
+}
+
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+ int i;
+
+ i = netdev_lock_pos(dev->type);
+ lockdep_set_class_and_name(&dev->addr_list_lock,
+ &netdev_addr_lock_key[i],
+ netdev_lock_name[i]);
+}
+#else
+static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
+ unsigned short dev_type)
+{
+}
+
+static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
+{
+}
+#endif
+
/*******************************************************************************
*
* Protocol management and registration routines
@@ -507,209 +619,6 @@ void dev_remove_pack(struct packet_type *pt)
EXPORT_SYMBOL(dev_remove_pack);
-/**
- * dev_add_offload - register offload handlers
- * @po: protocol offload declaration
- *
- * Add protocol offload handlers to the networking stack. The passed
- * &proto_offload is linked into kernel lists and may not be freed until
- * it has been removed from the kernel lists.
- *
- * This call does not sleep therefore it can not
- * guarantee all CPU's that are in middle of receiving packets
- * will see the new offload handlers (until the next received packet).
- */
-void dev_add_offload(struct packet_offload *po)
-{
- struct packet_offload *elem;
-
- spin_lock(&offload_lock);
- list_for_each_entry(elem, &offload_base, list) {
- if (po->priority < elem->priority)
- break;
- }
- list_add_rcu(&po->list, elem->list.prev);
- spin_unlock(&offload_lock);
-}
-EXPORT_SYMBOL(dev_add_offload);
-
-/**
- * __dev_remove_offload - remove offload handler
- * @po: packet offload declaration
- *
- * Remove a protocol offload handler that was previously added to the
- * kernel offload handlers by dev_add_offload(). The passed &offload_type
- * is removed from the kernel lists and can be freed or reused once this
- * function returns.
- *
- * The packet type might still be in use by receivers
- * and must not be freed until after all the CPU's have gone
- * through a quiescent state.
- */
-static void __dev_remove_offload(struct packet_offload *po)
-{
- struct list_head *head = &offload_base;
- struct packet_offload *po1;
-
- spin_lock(&offload_lock);
-
- list_for_each_entry(po1, head, list) {
- if (po == po1) {
- list_del_rcu(&po->list);
- goto out;
- }
- }
-
- pr_warn("dev_remove_offload: %p not found\n", po);
-out:
- spin_unlock(&offload_lock);
-}
-
-/**
- * dev_remove_offload - remove packet offload handler
- * @po: packet offload declaration
- *
- * Remove a packet offload handler that was previously added to the kernel
- * offload handlers by dev_add_offload(). The passed &offload_type is
- * removed from the kernel lists and can be freed or reused once this
- * function returns.
- *
- * This call sleeps to guarantee that no CPU is looking at the packet
- * type after return.
- */
-void dev_remove_offload(struct packet_offload *po)
-{
- __dev_remove_offload(po);
-
- synchronize_net();
-}
-EXPORT_SYMBOL(dev_remove_offload);
-
-/******************************************************************************
- *
- * Device Boot-time Settings Routines
- *
- ******************************************************************************/
-
-/* Boot time configuration table */
-static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
-
-/**
- * netdev_boot_setup_add - add new setup entry
- * @name: name of the device
- * @map: configured settings for the device
- *
- * Adds new setup entry to the dev_boot_setup list. The function
- * returns 0 on error and 1 on success. This is a generic routine to
- * all netdevices.
- */
-static int netdev_boot_setup_add(char *name, struct ifmap *map)
-{
- struct netdev_boot_setup *s;
- int i;
-
- s = dev_boot_setup;
- for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
- if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
- memset(s[i].name, 0, sizeof(s[i].name));
- strlcpy(s[i].name, name, IFNAMSIZ);
- memcpy(&s[i].map, map, sizeof(s[i].map));
- break;
- }
- }
-
- return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
-}
-
-/**
- * netdev_boot_setup_check - check boot time settings
- * @dev: the netdevice
- *
- * Check boot time settings for the device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found, 1 if they are.
- */
-int netdev_boot_setup_check(struct net_device *dev)
-{
- struct netdev_boot_setup *s = dev_boot_setup;
- int i;
-
- for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
- if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
- !strcmp(dev->name, s[i].name)) {
- dev->irq = s[i].map.irq;
- dev->base_addr = s[i].map.base_addr;
- dev->mem_start = s[i].map.mem_start;
- dev->mem_end = s[i].map.mem_end;
- return 1;
- }
- }
- return 0;
-}
-EXPORT_SYMBOL(netdev_boot_setup_check);
-
-
-/**
- * netdev_boot_base - get address from boot time settings
- * @prefix: prefix for network device
- * @unit: id for network device
- *
- * Check boot time settings for the base address of device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found.
- */
-unsigned long netdev_boot_base(const char *prefix, int unit)
-{
- const struct netdev_boot_setup *s = dev_boot_setup;
- char name[IFNAMSIZ];
- int i;
-
- sprintf(name, "%s%d", prefix, unit);
-
- /*
- * If device already registered then return base of 1
- * to indicate not to probe for this interface
- */
- if (__dev_get_by_name(&init_net, name))
- return 1;
-
- for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
- if (!strcmp(name, s[i].name))
- return s[i].map.base_addr;
- return 0;
-}
-
-/*
- * Saves at boot time configured settings for any netdevice.
- */
-int __init netdev_boot_setup(char *str)
-{
- int ints[5];
- struct ifmap map;
-
- str = get_options(str, ARRAY_SIZE(ints), ints);
- if (!str || !*str)
- return 0;
-
- /* Save settings */
- memset(&map, 0, sizeof(map));
- if (ints[0] > 0)
- map.irq = ints[1];
- if (ints[0] > 1)
- map.base_addr = ints[2];
- if (ints[0] > 2)
- map.mem_start = ints[3];
- if (ints[0] > 3)
- map.mem_end = ints[4];
-
- /* Add new entry to the list */
- return netdev_boot_setup_add(str, &map);
-}
-
-__setup("netdev=", netdev_boot_setup);
-
/*******************************************************************************
*
* Device Interface Subroutines
@@ -759,6 +668,56 @@ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
+static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
+{
+ int k = stack->num_paths++;
+
+ if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
+ return NULL;
+
+ return &stack->path[k];
+}
+
+int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
+ struct net_device_path_stack *stack)
+{
+ const struct net_device *last_dev;
+ struct net_device_path_ctx ctx = {
+ .dev = dev,
+ };
+ struct net_device_path *path;
+ int ret = 0;
+
+ memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
+ stack->num_paths = 0;
+ while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
+ last_dev = ctx.dev;
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+
+ memset(path, 0, sizeof(struct net_device_path));
+ ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
+ if (ret < 0)
+ return -1;
+
+ if (WARN_ON_ONCE(last_dev == ctx.dev))
+ return -1;
+ }
+
+ if (!ctx.dev)
+ return ret;
+
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+ path->type = DEV_PATH_ETHERNET;
+ path->dev = ctx.dev;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_fill_forward_path);
+
/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
@@ -819,8 +778,7 @@ struct net_device *dev_get_by_name(struct net *net, const char *name)
rcu_read_lock();
dev = dev_get_by_name_rcu(net, name);
- if (dev)
- dev_hold(dev);
+ dev_hold(dev);
rcu_read_unlock();
return dev;
}
@@ -893,8 +851,7 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifindex);
- if (dev)
- dev_hold(dev);
+ dev_hold(dev);
rcu_read_unlock();
return dev;
}
@@ -930,33 +887,28 @@ EXPORT_SYMBOL(dev_get_by_napi_id);
* @net: network namespace
* @name: a pointer to the buffer where the name will be stored.
* @ifindex: the ifindex of the interface to get the name from.
- *
- * The use of raw_seqcount_begin() and cond_resched() before
- * retrying is required as we want to give the writers a chance
- * to complete when CONFIG_PREEMPTION is not set.
*/
int netdev_get_name(struct net *net, char *name, int ifindex)
{
struct net_device *dev;
- unsigned int seq;
+ int ret;
-retry:
- seq = raw_seqcount_begin(&devnet_rename_seq);
+ down_read(&devnet_rename_sem);
rcu_read_lock();
+
dev = dev_get_by_index_rcu(net, ifindex);
if (!dev) {
- rcu_read_unlock();
- return -ENODEV;
+ ret = -ENODEV;
+ goto out;
}
strcpy(name, dev->name);
- rcu_read_unlock();
- if (read_seqcount_retry(&devnet_rename_seq, seq)) {
- cond_resched();
- goto retry;
- }
- return 0;
+ ret = 0;
+out:
+ rcu_read_unlock();
+ up_read(&devnet_rename_sem);
+ return ret;
}
/**
@@ -987,19 +939,6 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
-struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
-{
- struct net_device *dev;
-
- ASSERT_RTNL();
- for_each_netdev(net, dev)
- if (dev->type == type)
- return dev;
-
- return NULL;
-}
-EXPORT_SYMBOL(__dev_getfirstbyhwtype);
-
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
struct net_device *dev, *ret = NULL;
@@ -1050,7 +989,7 @@ EXPORT_SYMBOL(__dev_get_by_flags);
* @name: name string
*
* Network device names need to be valid file names to
- * to allow sysfs to work. We also disallow any kind of
+ * allow sysfs to work. We also disallow any kind of
* whitespace.
*/
bool dev_valid_name(const char *name)
@@ -1113,6 +1052,18 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
return -ENOMEM;
for_each_netdev(net, d) {
+ struct netdev_name_node *name_node;
+ list_for_each_entry(name_node, &d->name_node->list, list) {
+ if (!sscanf(name_node->name, name, &i))
+ continue;
+ if (i < 0 || i >= max_netdevices)
+ continue;
+
+ /* avoid cases where sscanf is not exact inverse of printf */
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!strncmp(buf, name_node->name, IFNAMSIZ))
+ __set_bit(i, inuse);
+ }
if (!sscanf(d->name, name, &i))
continue;
if (i < 0 || i >= max_netdevices)
@@ -1121,7 +1072,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
/* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i);
if (!strncmp(buf, d->name, IFNAMSIZ))
- set_bit(i, inuse);
+ __set_bit(i, inuse);
}
i = find_first_zero_bit(inuse, max_netdevices);
@@ -1129,7 +1080,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
}
snprintf(buf, IFNAMSIZ, name, i);
- if (!__dev_get_by_name(net, buf))
+ if (!netdev_name_in_use(net, buf))
return i;
/* It is possible to run out of possible slots
@@ -1149,7 +1100,7 @@ static int dev_alloc_name_ns(struct net *net,
BUG_ON(!net);
ret = __dev_alloc_name(net, name, buf);
if (ret >= 0)
- strlcpy(dev->name, buf, IFNAMSIZ);
+ strscpy(dev->name, buf, IFNAMSIZ);
return ret;
}
@@ -1183,10 +1134,10 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev,
if (strchr(name, '%'))
return dev_alloc_name_ns(net, dev, name);
- else if (__dev_get_by_name(net, name))
+ else if (netdev_name_in_use(net, name))
return -EEXIST;
else if (dev->name != name)
- strlcpy(dev->name, name, IFNAMSIZ);
+ strscpy(dev->name, name, IFNAMSIZ);
return 0;
}
@@ -1228,10 +1179,10 @@ int dev_change_name(struct net_device *dev, const char *newname)
likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
return -EBUSY;
- write_seqcount_begin(&devnet_rename_seq);
+ down_write(&devnet_rename_sem);
if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
- write_seqcount_end(&devnet_rename_seq);
+ up_write(&devnet_rename_sem);
return 0;
}
@@ -1239,7 +1190,7 @@ int dev_change_name(struct net_device *dev, const char *newname)
err = dev_get_valid_name(net, dev, newname);
if (err < 0) {
- write_seqcount_end(&devnet_rename_seq);
+ up_write(&devnet_rename_sem);
return err;
}
@@ -1254,23 +1205,23 @@ rollback:
if (ret) {
memcpy(dev->name, oldname, IFNAMSIZ);
dev->name_assign_type = old_assign_type;
- write_seqcount_end(&devnet_rename_seq);
+ up_write(&devnet_rename_sem);
return ret;
}
- write_seqcount_end(&devnet_rename_seq);
+ up_write(&devnet_rename_sem);
netdev_adjacent_rename_links(dev, oldname);
- write_lock_bh(&dev_base_lock);
+ write_lock(&dev_base_lock);
netdev_name_node_del(dev->name_node);
- write_unlock_bh(&dev_base_lock);
+ write_unlock(&dev_base_lock);
synchronize_rcu();
- write_lock_bh(&dev_base_lock);
+ write_lock(&dev_base_lock);
netdev_name_node_add(net, dev->name_node);
- write_unlock_bh(&dev_base_lock);
+ write_unlock(&dev_base_lock);
ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
ret = notifier_to_errno(ret);
@@ -1279,15 +1230,15 @@ rollback:
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
- write_seqcount_begin(&devnet_rename_seq);
+ down_write(&devnet_rename_sem);
memcpy(dev->name, oldname, IFNAMSIZ);
memcpy(oldname, newname, IFNAMSIZ);
dev->name_assign_type = old_assign_type;
old_assign_type = NET_NAME_RENAMED;
goto rollback;
} else {
- pr_err("%s: name change rollback failed: %d\n",
- dev->name, ret);
+ netdev_err(dev, "name change rollback failed: %d\n",
+ ret);
}
}
@@ -1388,6 +1339,25 @@ void netdev_state_change(struct net_device *dev)
EXPORT_SYMBOL(netdev_state_change);
/**
+ * __netdev_notify_peers - notify network peers about existence of @dev,
+ * to be called when rtnl lock is already held.
+ * @dev: network device
+ *
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
+ */
+void __netdev_notify_peers(struct net_device *dev)
+{
+ ASSERT_RTNL();
+ call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+ call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
+}
+EXPORT_SYMBOL(__netdev_notify_peers);
+
+/**
* netdev_notify_peers - notify network peers about existence of @dev
* @dev: network device
*
@@ -1400,21 +1370,47 @@ EXPORT_SYMBOL(netdev_state_change);
void netdev_notify_peers(struct net_device *dev)
{
rtnl_lock();
- call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
- call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
+ __netdev_notify_peers(dev);
rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
+static int napi_threaded_poll(void *data);
+
+static int napi_kthread_create(struct napi_struct *n)
+{
+ int err = 0;
+
+ /* Create and wake up the kthread once to put it in
+ * TASK_INTERRUPTIBLE mode to avoid the blocked task
+ * warning and work with loadavg.
+ */
+ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
+ n->dev->name, n->napi_id);
+ if (IS_ERR(n->thread)) {
+ err = PTR_ERR(n->thread);
+ pr_err("kthread_run failed with err %d\n", err);
+ n->thread = NULL;
+ }
+
+ return err;
+}
+
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
int ret;
ASSERT_RTNL();
+ dev_addr_check(dev);
- if (!netif_device_present(dev))
- return -ENODEV;
+ if (!netif_device_present(dev)) {
+ /* may be detached because parent is runtime-suspended */
+ if (dev->dev.parent)
+ pm_runtime_resume(dev->dev.parent);
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ }
/* Block netpoll from trying to do any rx path servicing.
* If we don't do this there is a chance ndo_poll_controller
@@ -1631,7 +1627,8 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
- N(PRE_CHANGEADDR)
+ N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
+ N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
}
#undef N
return "UNKNOWN_NETDEV_EVENT";
@@ -1948,6 +1945,32 @@ static int call_netdevice_notifiers_info(unsigned long val,
return raw_notifier_call_chain(&netdev_chain, val, info);
}
+/**
+ * call_netdevice_notifiers_info_robust - call per-netns notifier blocks
+ * for and rollback on error
+ * @val_up: value passed unmodified to notifier function
+ * @val_down: value passed unmodified to the notifier function when
+ * recovering from an error on @val_up
+ * @info: notifier information data
+ *
+ * Call all per-netns network notifier blocks, but not notifier blocks on
+ * the global notifier chain. Parameters and return value are as for
+ * raw_notifier_call_chain_robust().
+ */
+
+static int
+call_netdevice_notifiers_info_robust(unsigned long val_up,
+ unsigned long val_down,
+ struct netdev_notifier_info *info)
+{
+ struct net *net = dev_net(info->dev);
+
+ ASSERT_RTNL();
+
+ return raw_notifier_call_chain_robust(&net->netdev_chain,
+ val_up, val_down, info);
+}
+
static int call_netdevice_notifiers_extack(unsigned long val,
struct net_device *dev,
struct netlink_ext_ack *extack)
@@ -2029,7 +2052,8 @@ void net_dec_egress_queue(void)
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
-static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
+DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
+EXPORT_SYMBOL(netstamp_needed_key);
#ifdef CONFIG_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
@@ -2090,40 +2114,27 @@ EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
skb->tstamp = 0;
+ skb->mono_delivery_time = 0;
if (static_branch_unlikely(&netstamp_needed_key))
- __net_timestamp(skb);
+ skb->tstamp = ktime_get_real();
}
#define net_timestamp_check(COND, SKB) \
if (static_branch_unlikely(&netstamp_needed_key)) { \
if ((COND) && !(SKB)->tstamp) \
- __net_timestamp(SKB); \
+ (SKB)->tstamp = ktime_get_real(); \
} \
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
- unsigned int len;
-
- if (!(dev->flags & IFF_UP))
- return false;
-
- len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
- if (skb->len <= len)
- return true;
-
- /* if TSO is enabled, we don't care about the length as the packet
- * could be forwarded without being segmented before
- */
- if (skb_is_gso(skb))
- return true;
-
- return false;
+ return __is_skb_forwardable(dev, skb, true);
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);
-int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
+ bool check_mtu)
{
- int ret = ____dev_forward_skb(dev, skb);
+ int ret = ____dev_forward_skb(dev, skb, check_mtu);
if (likely(!ret)) {
skb->protocol = eth_type_trans(skb, dev);
@@ -2132,6 +2143,11 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
return ret;
}
+
+int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ return __dev_forward_skb2(dev, skb, true);
+}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
@@ -2158,6 +2174,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
+int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
+{
+ return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
+}
+
static inline int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
@@ -2301,7 +2322,7 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
/* If TC0 is invalidated disable TC mapping */
if (tc->offset + tc->count > txq) {
- pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
+ netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
dev->num_tc = 0;
return;
}
@@ -2312,8 +2333,8 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
tc = &dev->tc_to_txq[q];
if (tc->offset + tc->count > txq) {
- pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
- i, q);
+ netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
+ i, q);
netdev_set_prio_tc_map(dev, i, 0);
}
}
@@ -2340,16 +2361,14 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
EXPORT_SYMBOL(netdev_txq_to_tc);
#ifdef CONFIG_XPS
-struct static_key xps_needed __read_mostly;
-EXPORT_SYMBOL(xps_needed);
-struct static_key xps_rxqs_needed __read_mostly;
-EXPORT_SYMBOL(xps_rxqs_needed);
+static struct static_key xps_needed __read_mostly;
+static struct static_key xps_rxqs_needed __read_mostly;
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
- int tci, u16 index)
+ struct xps_dev_maps *old_maps, int tci, u16 index)
{
struct xps_map *map = NULL;
int pos;
@@ -2368,6 +2387,8 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
break;
}
+ if (old_maps)
+ RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
kfree_rcu(map, rcu);
return false;
@@ -2380,7 +2401,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
struct xps_dev_maps *dev_maps,
int cpu, u16 offset, u16 count)
{
- int num_tc = dev->num_tc ? : 1;
+ int num_tc = dev_maps->num_tc;
bool active = false;
int tci;
@@ -2388,7 +2409,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
int i, j;
for (i = count, j = offset; i--; j++) {
- if (!remove_xps_queue(dev_maps, tci, j))
+ if (!remove_xps_queue(dev_maps, NULL, tci, j))
break;
}
@@ -2400,74 +2421,54 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
static void reset_xps_maps(struct net_device *dev,
struct xps_dev_maps *dev_maps,
- bool is_rxqs_map)
+ enum xps_map_type type)
{
- if (is_rxqs_map) {
- static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
- RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
- } else {
- RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
- }
static_key_slow_dec_cpuslocked(&xps_needed);
+ if (type == XPS_RXQS)
+ static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
+
+ RCU_INIT_POINTER(dev->xps_maps[type], NULL);
+
kfree_rcu(dev_maps, rcu);
}
-static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
- struct xps_dev_maps *dev_maps, unsigned int nr_ids,
- u16 offset, u16 count, bool is_rxqs_map)
+static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
+ u16 offset, u16 count)
{
+ struct xps_dev_maps *dev_maps;
bool active = false;
int i, j;
- for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
- j < nr_ids;)
- active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
- count);
+ dev_maps = xmap_dereference(dev->xps_maps[type]);
+ if (!dev_maps)
+ return;
+
+ for (j = 0; j < dev_maps->nr_ids; j++)
+ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
if (!active)
- reset_xps_maps(dev, dev_maps, is_rxqs_map);
+ reset_xps_maps(dev, dev_maps, type);
- if (!is_rxqs_map) {
- for (i = offset + (count - 1); count--; i--) {
+ if (type == XPS_CPUS) {
+ for (i = offset + (count - 1); count--; i--)
netdev_queue_numa_node_write(
- netdev_get_tx_queue(dev, i),
- NUMA_NO_NODE);
- }
+ netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
}
}
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
u16 count)
{
- const unsigned long *possible_mask = NULL;
- struct xps_dev_maps *dev_maps;
- unsigned int nr_ids;
-
if (!static_key_false(&xps_needed))
return;
cpus_read_lock();
mutex_lock(&xps_map_mutex);
- if (static_key_false(&xps_rxqs_needed)) {
- dev_maps = xmap_dereference(dev->xps_rxqs_map);
- if (dev_maps) {
- nr_ids = dev->num_rx_queues;
- clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
- offset, count, true);
- }
- }
-
- dev_maps = xmap_dereference(dev->xps_cpus_map);
- if (!dev_maps)
- goto out_no_maps;
+ if (static_key_false(&xps_rxqs_needed))
+ clean_xps_maps(dev, XPS_RXQS, offset, count);
- if (num_possible_cpus() > 1)
- possible_mask = cpumask_bits(cpu_possible_mask);
- nr_ids = nr_cpu_ids;
- clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
- false);
+ clean_xps_maps(dev, XPS_CPUS, offset, count);
-out_no_maps:
mutex_unlock(&xps_map_mutex);
cpus_read_unlock();
}
@@ -2517,16 +2518,35 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
return new_map;
}
+/* Copy xps maps at a given index */
+static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
+ struct xps_dev_maps *new_dev_maps, int index,
+ int tc, bool skip_tc)
+{
+ int i, tci = index * dev_maps->num_tc;
+ struct xps_map *map;
+
+ /* copy maps belonging to foreign traffic classes */
+ for (i = 0; i < dev_maps->num_tc; i++, tci++) {
+ if (i == tc && skip_tc)
+ continue;
+
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
+ }
+}
+
/* Must be called under cpus_read_lock */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
- u16 index, bool is_rxqs_map)
+ u16 index, enum xps_map_type type)
{
- const unsigned long *online_mask = NULL, *possible_mask = NULL;
- struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
+ const unsigned long *online_mask = NULL;
+ bool active = false, copy = false;
int i, j, tci, numa_node_id = -2;
int maps_sz, num_tc = 1, tc = 0;
struct xps_map *map, *new_map;
- bool active = false;
unsigned int nr_ids;
if (dev->num_tc) {
@@ -2544,38 +2564,48 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
}
mutex_lock(&xps_map_mutex);
- if (is_rxqs_map) {
+
+ dev_maps = xmap_dereference(dev->xps_maps[type]);
+ if (type == XPS_RXQS) {
maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
- dev_maps = xmap_dereference(dev->xps_rxqs_map);
nr_ids = dev->num_rx_queues;
} else {
maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
- if (num_possible_cpus() > 1) {
+ if (num_possible_cpus() > 1)
online_mask = cpumask_bits(cpu_online_mask);
- possible_mask = cpumask_bits(cpu_possible_mask);
- }
- dev_maps = xmap_dereference(dev->xps_cpus_map);
nr_ids = nr_cpu_ids;
}
if (maps_sz < L1_CACHE_BYTES)
maps_sz = L1_CACHE_BYTES;
+ /* The old dev_maps could be larger or smaller than the one we're
+ * setting up now, as dev->num_tc or nr_ids could have been updated in
+ * between. We could try to be smart, but let's be safe instead and only
+ * copy foreign traffic classes if the two map sizes match.
+ */
+ if (dev_maps &&
+ dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
+ copy = true;
+
/* allocate memory for queue storage */
for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
j < nr_ids;) {
- if (!new_dev_maps)
- new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
- mutex_unlock(&xps_map_mutex);
- return -ENOMEM;
+ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
+ if (!new_dev_maps) {
+ mutex_unlock(&xps_map_mutex);
+ return -ENOMEM;
+ }
+
+ new_dev_maps->nr_ids = nr_ids;
+ new_dev_maps->num_tc = num_tc;
}
tci = j * num_tc + tc;
- map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
- NULL;
+ map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
- map = expand_xps_map(map, j, index, is_rxqs_map);
+ map = expand_xps_map(map, j, index, type == XPS_RXQS);
if (!map)
goto error;
@@ -2588,29 +2618,21 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
if (!dev_maps) {
/* Increment static keys at most once per type */
static_key_slow_inc_cpuslocked(&xps_needed);
- if (is_rxqs_map)
+ if (type == XPS_RXQS)
static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
}
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- /* copy maps belonging to foreign traffic classes */
- for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
- }
+ for (j = 0; j < nr_ids; j++) {
+ bool skip_tc = false;
- /* We need to explicitly update tci as prevous loop
- * could break out early if dev_maps is NULL.
- */
tci = j * num_tc + tc;
-
if (netif_attr_test_mask(j, mask, nr_ids) &&
netif_attr_test_online(j, online_mask, nr_ids)) {
/* add tx-queue to CPU/rx-queue maps */
int pos = 0;
+ skip_tc = true;
+
map = xmap_dereference(new_dev_maps->attr_map[tci]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
@@ -2618,78 +2640,81 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
if (pos == map->len)
map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
- if (!is_rxqs_map) {
+ if (type == XPS_CPUS) {
if (numa_node_id == -2)
numa_node_id = cpu_to_node(j);
else if (numa_node_id != cpu_to_node(j))
numa_node_id = -1;
}
#endif
- } else if (dev_maps) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
- /* copy maps belonging to foreign traffic classes */
- for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
- }
+ if (copy)
+ xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
+ skip_tc);
}
- if (is_rxqs_map)
- rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
- else
- rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
+ rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
/* Cleanup old maps */
if (!dev_maps)
goto out_no_old_maps;
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- for (i = num_tc, tci = j * num_tc; i--; tci++) {
- new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ for (j = 0; j < dev_maps->nr_ids; j++) {
+ for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
map = xmap_dereference(dev_maps->attr_map[tci]);
- if (map && map != new_map)
- kfree_rcu(map, rcu);
+ if (!map)
+ continue;
+
+ if (copy) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ if (map == new_map)
+ continue;
+ }
+
+ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
+ kfree_rcu(map, rcu);
}
}
- kfree_rcu(dev_maps, rcu);
+ old_dev_maps = dev_maps;
out_no_old_maps:
dev_maps = new_dev_maps;
active = true;
out_no_new_maps:
- if (!is_rxqs_map) {
+ if (type == XPS_CPUS)
/* update Tx queue numa node */
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
(numa_node_id >= 0) ?
numa_node_id : NUMA_NO_NODE);
- }
if (!dev_maps)
goto out_no_maps;
/* removes tx-queue from unused CPUs/rx-queues */
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- for (i = tc, tci = j * num_tc; i--; tci++)
- active |= remove_xps_queue(dev_maps, tci, index);
- if (!netif_attr_test_mask(j, mask, nr_ids) ||
- !netif_attr_test_online(j, online_mask, nr_ids))
- active |= remove_xps_queue(dev_maps, tci, index);
- for (i = num_tc - tc, tci++; --i; tci++)
- active |= remove_xps_queue(dev_maps, tci, index);
+ for (j = 0; j < dev_maps->nr_ids; j++) {
+ tci = j * dev_maps->num_tc;
+
+ for (i = 0; i < dev_maps->num_tc; i++, tci++) {
+ if (i == tc &&
+ netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
+ netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
+ continue;
+
+ active |= remove_xps_queue(dev_maps,
+ copy ? old_dev_maps : NULL,
+ tci, index);
+ }
}
+ if (old_dev_maps)
+ kfree_rcu(old_dev_maps, rcu);
+
/* free map if not active */
if (!active)
- reset_xps_maps(dev, dev_maps, is_rxqs_map);
+ reset_xps_maps(dev, dev_maps, type);
out_no_maps:
mutex_unlock(&xps_map_mutex);
@@ -2697,11 +2722,10 @@ out_no_maps:
return 0;
error:
/* remove any maps that we added */
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
+ for (j = 0; j < nr_ids; j++) {
for (i = num_tc, tci = j * num_tc; i--; tci++) {
new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
- map = dev_maps ?
+ map = copy ?
xmap_dereference(dev_maps->attr_map[tci]) :
NULL;
if (new_map && new_map != map)
@@ -2722,7 +2746,7 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
int ret;
cpus_read_lock();
- ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+ ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
cpus_read_unlock();
return ret;
@@ -2874,6 +2898,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->num_tc)
netif_setup_tc(dev, txq);
+ dev_qdisc_change_real_num_tx(dev, txq);
+
dev->real_num_tx_queues = txq;
if (disabling) {
@@ -2925,15 +2951,117 @@ EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
/**
+ * netif_set_real_num_queues - set actual number of RX and TX queues used
+ * @dev: Network device
+ * @txq: Actual number of TX queues
+ * @rxq: Actual number of RX queues
+ *
+ * Set the real number of both TX and RX queues.
+ * Does nothing if the number of queues is already correct.
+ */
+int netif_set_real_num_queues(struct net_device *dev,
+ unsigned int txq, unsigned int rxq)
+{
+ unsigned int old_rxq = dev->real_num_rx_queues;
+ int err;
+
+ if (txq < 1 || txq > dev->num_tx_queues ||
+ rxq < 1 || rxq > dev->num_rx_queues)
+ return -EINVAL;
+
+ /* Start from increases, so the error path only does decreases -
+ * decreases can't fail.
+ */
+ if (rxq > dev->real_num_rx_queues) {
+ err = netif_set_real_num_rx_queues(dev, rxq);
+ if (err)
+ return err;
+ }
+ if (txq > dev->real_num_tx_queues) {
+ err = netif_set_real_num_tx_queues(dev, txq);
+ if (err)
+ goto undo_rx;
+ }
+ if (rxq < dev->real_num_rx_queues)
+ WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
+ if (txq < dev->real_num_tx_queues)
+ WARN_ON(netif_set_real_num_tx_queues(dev, txq));
+
+ return 0;
+undo_rx:
+ WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
+ return err;
+}
+EXPORT_SYMBOL(netif_set_real_num_queues);
+
+/**
+ * netif_set_tso_max_size() - set the max size of TSO frames supported
+ * @dev: netdev to update
+ * @size: max skb->len of a TSO frame
+ *
+ * Set the limit on the size of TSO super-frames the device can handle.
+ * Unless explicitly set the stack will assume the value of
+ * %GSO_LEGACY_MAX_SIZE.
+ */
+void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
+{
+ dev->tso_max_size = min(GSO_MAX_SIZE, size);
+ if (size < READ_ONCE(dev->gso_max_size))
+ netif_set_gso_max_size(dev, size);
+}
+EXPORT_SYMBOL(netif_set_tso_max_size);
+
+/**
+ * netif_set_tso_max_segs() - set the max number of segs supported for TSO
+ * @dev: netdev to update
+ * @segs: max number of TCP segments
+ *
+ * Set the limit on the number of TCP segments the device can generate from
+ * a single TSO super-frame.
+ * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
+ */
+void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
+{
+ dev->tso_max_segs = segs;
+ if (segs < READ_ONCE(dev->gso_max_segs))
+ netif_set_gso_max_segs(dev, segs);
+}
+EXPORT_SYMBOL(netif_set_tso_max_segs);
+
+/**
+ * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
+ * @to: netdev to update
+ * @from: netdev from which to copy the limits
+ */
+void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
+{
+ netif_set_tso_max_size(to, from->tso_max_size);
+ netif_set_tso_max_segs(to, from->tso_max_segs);
+}
+EXPORT_SYMBOL(netif_inherit_tso_max);
+
+/**
* netif_get_num_default_rss_queues - default number of RSS queues
*
- * This routine should set an upper limit on the number of RSS queues
- * used by default by multiqueue devices.
+ * Default value is the number of physical cores if there are only 1 or 2, or
+ * divided by 2 if there are more.
*/
int netif_get_num_default_rss_queues(void)
{
- return is_kdump_kernel() ?
- 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
+ cpumask_var_t cpus;
+ int cpu, count = 0;
+
+ if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
+ return 1;
+
+ cpumask_copy(cpus, cpu_online_mask);
+ for_each_cpu(cpu, cpus) {
+ ++count;
+ cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
+ }
+ free_cpumask_var(cpus);
+
+ return count > 2 ? DIV_ROUND_UP(count, 2) : count;
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);
@@ -3016,7 +3144,7 @@ EXPORT_SYMBOL(__dev_kfree_skb_irq);
void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
- if (in_irq() || irqs_disabled())
+ if (in_hardirq() || irqs_disabled())
__dev_kfree_skb_irq(skb, reason);
else
dev_kfree_skb(skb);
@@ -3072,6 +3200,12 @@ static u16 skb_tx_hash(const struct net_device *dev,
qoffset = sb_dev->tc_to_txq[tc].offset;
qcount = sb_dev->tc_to_txq[tc].count;
+ if (unlikely(!qcount)) {
+ net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
+ sb_dev->name, qoffset, tc);
+ qoffset = 0;
+ qcount = dev->real_num_tx_queues;
+ }
}
if (skb_rx_queue_recorded(skb)) {
@@ -3119,7 +3253,7 @@ int skb_checksum_help(struct sk_buff *skb)
if (skb->ip_summed == CHECKSUM_COMPLETE)
goto out_set_summed;
- if (unlikely(skb_shinfo(skb)->gso_size)) {
+ if (unlikely(skb_is_gso(skb))) {
skb_warn_bad_offload(skb);
return -EINVAL;
}
@@ -3134,12 +3268,18 @@ int skb_checksum_help(struct sk_buff *skb)
}
offset = skb_checksum_start_offset(skb);
- BUG_ON(offset >= skb_headlen(skb));
+ ret = -EINVAL;
+ if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
+ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
+ goto out;
+ }
csum = skb_checksum(skb, offset, skb->len - offset, 0);
offset += skb->csum_offset;
- BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
-
+ if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) {
+ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
+ goto out;
+ }
ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
if (ret)
goto out;
@@ -3210,40 +3350,6 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
return __vlan_get_protocol(skb, type, depth);
}
-/**
- * skb_mac_gso_segment - mac layer segmentation handler.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- */
-struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
-{
- struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
- struct packet_offload *ptype;
- int vlan_depth = skb->mac_len;
- __be16 type = skb_network_protocol(skb, &vlan_depth);
-
- if (unlikely(!type))
- return ERR_PTR(-EINVAL);
-
- __skb_pull(skb, vlan_depth);
-
- rcu_read_lock();
- list_for_each_entry_rcu(ptype, &offload_base, list) {
- if (ptype->type == type && ptype->callbacks.gso_segment) {
- segs = ptype->callbacks.gso_segment(skb, features);
- break;
- }
- }
- rcu_read_unlock();
-
- __skb_push(skb, skb->data - skb_mac_header(skb));
-
- return segs;
-}
-EXPORT_SYMBOL(skb_mac_gso_segment);
-
-
/* openvswitch calls this on rx path, so we need a different check.
*/
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
@@ -3266,7 +3372,7 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
*
- * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
+ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
*/
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
netdev_features_t features, bool tx_path)
@@ -3295,7 +3401,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
features &= ~NETIF_F_GSO_PARTIAL;
}
- BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
+ BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
@@ -3315,13 +3421,16 @@ EXPORT_SYMBOL(__skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
+static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
+{
+ netdev_err(dev, "hw csum failure\n");
+ skb_dump(KERN_ERR, skb, true);
+ dump_stack();
+}
+
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
- if (net_ratelimit()) {
- pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
- skb_dump(KERN_ERR, skb, true);
- dump_stack();
- }
+ DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
@@ -3369,10 +3478,9 @@ static netdev_features_t net_mpls_features(struct sk_buff *skb,
static netdev_features_t harmonize_features(struct sk_buff *skb,
netdev_features_t features)
{
- int tmp;
__be16 type;
- type = skb_network_protocol(skb, &tmp);
+ type = skb_network_protocol(skb, NULL);
features = net_mpls_features(skb, features, type);
if (skb->ip_summed != CHECKSUM_NONE &&
@@ -3406,8 +3514,13 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
{
u16 gso_segs = skb_shinfo(skb)->gso_segs;
- if (gso_segs > dev->gso_max_segs)
+ if (gso_segs > READ_ONCE(dev->gso_max_segs))
+ return features & ~NETIF_F_GSO_MASK;
+
+ if (!skb_shinfo(skb)->gso_type) {
+ skb_warn_bad_offload(skb);
return features & ~NETIF_F_GSO_MASK;
+ }
/* Support for GSO partial features requires software
* intervention before we can actually process the packets
@@ -3520,11 +3633,22 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
int skb_csum_hwoffload_help(struct sk_buff *skb,
const netdev_features_t features)
{
- if (unlikely(skb->csum_not_inet))
+ if (unlikely(skb_csum_is_sctp(skb)))
return !!(features & NETIF_F_SCTP_CRC) ? 0 :
skb_crc32c_csum_help(skb);
- return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
+ if (features & NETIF_F_HW_CSUM)
+ return 0;
+
+ if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
+ switch (skb->csum_offset) {
+ case offsetof(struct tcphdr, check):
+ case offsetof(struct udphdr, check):
+ return 0;
+ }
+ }
+
+ return skb_checksum_help(skb);
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);
@@ -3579,7 +3703,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
out_kfree_skb:
kfree_skb(skb);
out_null:
- atomic_long_inc(&dev->tx_dropped);
+ dev_core_stats_tx_dropped_inc(dev);
return NULL;
}
@@ -3652,6 +3776,18 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
}
}
+static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
+ struct sk_buff **to_free,
+ struct netdev_queue *txq)
+{
+ int rc;
+
+ rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
+ if (rc == NET_XMIT_SUCCESS)
+ trace_qdisc_enqueue(q, txq, skb);
+ return rc;
+}
+
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev,
struct netdev_queue *txq)
@@ -3664,11 +3800,35 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
qdisc_calculate_pkt_len(skb, q);
if (q->flags & TCQ_F_NOLOCK) {
- rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+ if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
+ qdisc_run_begin(q)) {
+ /* Retest nolock_qdisc_is_empty() within the protection
+ * of q->seqlock to protect from racing with requeuing.
+ */
+ if (unlikely(!nolock_qdisc_is_empty(q))) {
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+ __qdisc_run(q);
+ qdisc_run_end(q);
+
+ goto no_lock_out;
+ }
+
+ qdisc_bstats_cpu_update(q, skb);
+ if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
+ !nolock_qdisc_is_empty(q))
+ __qdisc_run(q);
+
+ qdisc_run_end(q);
+ return NET_XMIT_SUCCESS;
+ }
+
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
qdisc_run(q);
+no_lock_out:
if (unlikely(to_free))
- kfree_skb_list(to_free);
+ kfree_skb_list_reason(to_free,
+ SKB_DROP_REASON_QDISC_DROP);
return rc;
}
@@ -3677,8 +3837,12 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
* separate lock before trying to get qdisc main lock.
* This permits qdisc->running owner to get the lock more
* often and dequeue packets faster.
+ * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
+ * and then other tasks will only enqueue packets. The packets will be
+ * sent after the qdisc owner is scheduled again. To prevent this
+ * scenario the task always serialize on the lock.
*/
- contended = qdisc_is_running(q);
+ contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
if (unlikely(contended))
spin_lock(&q->busylock);
@@ -3707,7 +3871,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
- rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
@@ -3719,7 +3883,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
}
spin_unlock(root_lock);
if (unlikely(to_free))
- kfree_skb_list(to_free);
+ kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP);
if (unlikely(contended))
spin_unlock(&q->busylock);
return rc;
@@ -3761,10 +3925,11 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
skb_reset_mac_header(skb);
__skb_pull(skb, skb_network_offset(skb));
skb->pkt_type = PACKET_LOOPBACK;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- WARN_ON(!skb_dst(skb));
+ if (skb->ip_summed == CHECKSUM_NONE)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
skb_dst_force(skb);
- netif_rx_ni(skb);
+ netif_rx(skb);
return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
@@ -3773,6 +3938,7 @@ EXPORT_SYMBOL(dev_loopback_xmit);
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
+#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
struct tcf_result cl_res;
@@ -3780,9 +3946,11 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
return skb;
/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
+ tc_skb_cb(skb)->mru = 0;
+ tc_skb_cb(skb)->post_ct = false;
mini_qdisc_bstats_cpu_update(miniq, skb);
- switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
+ switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
skb->tc_index = TC_H_MIN(cl_res.classid);
@@ -3790,7 +3958,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
case TC_ACT_SHOT:
mini_qdisc_qstats_cpu_drop(miniq);
*ret = NET_XMIT_DROP;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
@@ -3806,22 +3974,44 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
default:
break;
}
+#endif /* CONFIG_NET_CLS_ACT */
return skb;
}
+
+static struct netdev_queue *
+netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
+{
+ int qm = skb_get_queue_mapping(skb);
+
+ return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
+}
+
+static bool netdev_xmit_txqueue_skipped(void)
+{
+ return __this_cpu_read(softnet_data.xmit.skip_txqueue);
+}
+
+void netdev_xmit_skip_txqueue(bool skip)
+{
+ __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
+}
+EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
#endif /* CONFIG_NET_EGRESS */
#ifdef CONFIG_XPS
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
struct xps_dev_maps *dev_maps, unsigned int tci)
{
+ int tc = netdev_get_prio_tc_map(dev, skb->priority);
struct xps_map *map;
int queue_index = -1;
- if (dev->num_tc) {
- tci *= dev->num_tc;
- tci += netdev_get_prio_tc_map(dev, skb->priority);
- }
+ if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
+ return queue_index;
+
+ tci *= dev_maps->num_tc;
+ tci += tc;
map = rcu_dereference(dev_maps->attr_map[tci]);
if (map) {
@@ -3852,18 +4042,18 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
if (!static_key_false(&xps_rxqs_needed))
goto get_cpus_map;
- dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
+ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
if (dev_maps) {
int tci = sk_rx_queue_get(sk);
- if (tci >= 0 && tci < dev->num_rx_queues)
+ if (tci >= 0)
queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
tci);
}
get_cpus_map:
if (queue_index < 0) {
- dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
+ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
if (dev_maps) {
unsigned int tci = skb->sender_cpu - 1;
@@ -3949,43 +4139,39 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
}
/**
- * __dev_queue_xmit - transmit a buffer
- * @skb: buffer to transmit
- * @sb_dev: suboordinate device used for L2 forwarding offload
+ * __dev_queue_xmit() - transmit a buffer
+ * @skb: buffer to transmit
+ * @sb_dev: suboordinate device used for L2 forwarding offload
*
- * Queue a buffer for transmission to a network device. The caller must
- * have set the device and priority and built the buffer before calling
- * this function. The function can be called from an interrupt.
+ * Queue a buffer for transmission to a network device. The caller must
+ * have set the device and priority and built the buffer before calling
+ * this function. The function can be called from an interrupt.
*
- * A negative errno code is returned on a failure. A success does not
- * guarantee the frame will be transmitted as it may be dropped due
- * to congestion or traffic shaping.
+ * When calling this method, interrupts MUST be enabled. This is because
+ * the BH enable code must have IRQs enabled so that it will not deadlock.
*
- * -----------------------------------------------------------------------------------
- * I notice this method can also return errors from the queue disciplines,
- * including NET_XMIT_DROP, which is a positive value. So, errors can also
- * be positive.
+ * Regardless of the return value, the skb is consumed, so it is currently
+ * difficult to retry a send to this method. (You can bump the ref count
+ * before sending to hold a reference for retry if you are careful.)
*
- * Regardless of the return value, the skb is consumed, so it is currently
- * difficult to retry a send to this method. (You can bump the ref count
- * before sending to hold a reference for retry if you are careful.)
- *
- * When calling this method, interrupts MUST be enabled. This is because
- * the BH enable code must have IRQs enabled so that it will not deadlock.
- * --BLG
+ * Return:
+ * * 0 - buffer successfully transmitted
+ * * positive qdisc return code - NET_XMIT_DROP etc.
+ * * negative errno - other errors
*/
-static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
+int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
struct net_device *dev = skb->dev;
- struct netdev_queue *txq;
+ struct netdev_queue *txq = NULL;
struct Qdisc *q;
int rc = -ENOMEM;
bool again = false;
skb_reset_mac_header(skb);
+ skb_assert_len(skb);
if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
- __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
+ __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
@@ -3997,13 +4183,26 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
skb->tc_at_ingress = 0;
-# ifdef CONFIG_NET_EGRESS
+#endif
+#ifdef CONFIG_NET_EGRESS
if (static_branch_unlikely(&egress_needed_key)) {
+ if (nf_hook_egress_active()) {
+ skb = nf_hook_egress(skb, &rc, dev);
+ if (!skb)
+ goto out;
+ }
+
+ netdev_xmit_skip_txqueue(false);
+
+ nf_skip_egress(skb, true);
skb = sch_handle_egress(skb, &rc, dev);
if (!skb)
goto out;
+ nf_skip_egress(skb, false);
+
+ if (netdev_xmit_txqueue_skipped())
+ txq = netdev_tx_queue_mapping(dev, skb);
}
-# endif
#endif
/* If device/qdisc don't need skb->dst, release it right now while
* its hot in this cpu cache.
@@ -4013,7 +4212,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
else
skb_dst_force(skb);
- txq = netdev_core_pick_tx(dev, skb, sb_dev);
+ if (!txq)
+ txq = netdev_core_pick_tx(dev, skb, sb_dev);
+
q = rcu_dereference_bh(txq->qdisc);
trace_net_dev_queue(skb);
@@ -4037,7 +4238,10 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
- if (txq->xmit_lock_owner != cpu) {
+ /* Other cpus might concurrently change txq->xmit_lock_owner
+ * to -1 or to their cpu id, but not to our id.
+ */
+ if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
if (dev_xmit_recursion())
goto recursion_alert;
@@ -4072,27 +4276,16 @@ recursion_alert:
rc = -ENETDOWN;
rcu_read_unlock_bh();
- atomic_long_inc(&dev->tx_dropped);
+ dev_core_stats_tx_dropped_inc(dev);
kfree_skb_list(skb);
return rc;
out:
rcu_read_unlock_bh();
return rc;
}
+EXPORT_SYMBOL(__dev_queue_xmit);
-int dev_queue_xmit(struct sk_buff *skb)
-{
- return __dev_queue_xmit(skb, NULL);
-}
-EXPORT_SYMBOL(dev_queue_xmit);
-
-int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
-{
- return __dev_queue_xmit(skb, sb_dev);
-}
-EXPORT_SYMBOL(dev_queue_xmit_accel);
-
-int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
+int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
struct net_device *dev = skb->dev;
struct sk_buff *orig_skb = skb;
@@ -4113,23 +4306,21 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
local_bh_disable();
+ dev_xmit_recursion_inc();
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_xmit_frozen_or_drv_stopped(txq))
ret = netdev_start_xmit(skb, dev, txq, false);
HARD_TX_UNLOCK(dev, txq);
+ dev_xmit_recursion_dec();
local_bh_enable();
-
- if (!dev_xmit_complete(ret))
- kfree_skb(skb);
-
return ret;
drop:
- atomic_long_inc(&dev->tx_dropped);
+ dev_core_stats_tx_dropped_inc(dev);
kfree_skb_list(skb);
return NET_XMIT_DROP;
}
-EXPORT_SYMBOL(dev_direct_xmit);
+EXPORT_SYMBOL(__dev_direct_xmit);
/*************************************************************************
* Receiver routines
@@ -4139,20 +4330,45 @@ int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);
int netdev_tstamp_prequeue __read_mostly = 1;
+unsigned int sysctl_skb_defer_max __read_mostly = 64;
int netdev_budget __read_mostly = 300;
-unsigned int __read_mostly netdev_budget_usecs = 2000;
+/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
+unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
int weight_p __read_mostly = 64; /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
int dev_rx_weight __read_mostly = 64;
int dev_tx_weight __read_mostly = 64;
-/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
-int gro_normal_batch __read_mostly = 8;
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
+ struct task_struct *thread;
+
+ lockdep_assert_irqs_disabled();
+
+ if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
+ /* Paired with smp_mb__before_atomic() in
+ * napi_enable()/dev_set_threaded().
+ * Use READ_ONCE() to guarantee a complete
+ * read on napi->thread. Only call
+ * wake_up_process() when it's not NULL.
+ */
+ thread = READ_ONCE(napi->thread);
+ if (thread) {
+ /* Avoid doing set_bit() if the thread is in
+ * INTERRUPTIBLE state, cause napi_thread_wait()
+ * makes sure to proceed with napi polling
+ * if the thread is explicitly woken from here.
+ */
+ if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
+ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+ wake_up_process(thread);
+ return;
+ }
+ }
+
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
@@ -4366,16 +4582,25 @@ static void rps_trigger_softirq(void *data)
#endif /* CONFIG_RPS */
+/* Called from hardirq (IPI) context */
+static void trigger_rx_softirq(void *data)
+{
+ struct softnet_data *sd = data;
+
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ smp_store_release(&sd->defer_ipi_scheduled, 0);
+}
+
/*
* Check if this softnet_data structure is another cpu one
* If yes, queue it to our IPI list and return 1
* If no, return 0
*/
-static int rps_ipi_queued(struct softnet_data *sd)
+static int napi_schedule_rps(struct softnet_data *sd)
{
-#ifdef CONFIG_RPS
struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
+#ifdef CONFIG_RPS
if (sd != mysd) {
sd->rps_ipi_next = mysd->rps_ipi_list;
mysd->rps_ipi_list = sd;
@@ -4384,6 +4609,7 @@ static int rps_ipi_queued(struct softnet_data *sd)
return 1;
}
#endif /* CONFIG_RPS */
+ __napi_schedule_irqoff(&mysd->backlog);
return 0;
}
@@ -4398,7 +4624,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
struct softnet_data *sd;
unsigned int old_flow, new_flow;
- if (qlen < (netdev_max_backlog >> 1))
+ if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
return false;
sd = this_cpu_ptr(&softnet_data);
@@ -4434,46 +4660,42 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
unsigned int *qtail)
{
+ enum skb_drop_reason reason;
struct softnet_data *sd;
unsigned long flags;
unsigned int qlen;
+ reason = SKB_DROP_REASON_NOT_SPECIFIED;
sd = &per_cpu(softnet_data, cpu);
- local_irq_save(flags);
-
- rps_lock(sd);
+ rps_lock_irqsave(sd, &flags);
if (!netif_running(skb->dev))
goto drop;
qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
+ if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
if (qlen) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
input_queue_tail_incr_save(sd, qtail);
- rps_unlock(sd);
- local_irq_restore(flags);
+ rps_unlock_irq_restore(sd, &flags);
return NET_RX_SUCCESS;
}
/* Schedule NAPI for backlog device
* We can use non atomic operation since we own the queue lock
*/
- if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
- if (!rps_ipi_queued(sd))
- ____napi_schedule(sd, &sd->backlog);
- }
+ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
+ napi_schedule_rps(sd);
goto enqueue;
}
+ reason = SKB_DROP_REASON_CPU_BACKLOG;
drop:
sd->dropped++;
- rps_unlock(sd);
-
- local_irq_restore(flags);
+ rps_unlock_irq_restore(sd, &flags);
- atomic_long_inc(&skb->dev->rx_dropped);
- kfree_skb(skb);
+ dev_core_stats_rx_dropped_inc(skb->dev);
+ kfree_skb_reason(skb, reason);
return NET_RX_DROP;
}
@@ -4500,63 +4722,40 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
return rxqueue;
}
-static u32 netif_receive_generic_xdp(struct sk_buff *skb,
- struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
{
+ void *orig_data, *orig_data_end, *hard_start;
struct netdev_rx_queue *rxqueue;
- void *orig_data, *orig_data_end;
- u32 metalen, act = XDP_DROP;
+ bool orig_bcast, orig_host;
+ u32 mac_len, frame_sz;
__be16 orig_eth_type;
struct ethhdr *eth;
- bool orig_bcast;
- int hlen, off;
- u32 mac_len;
-
- /* Reinjected packets coming from act_mirred or similar should
- * not get XDP generic processing.
- */
- if (skb_is_tc_redirected(skb))
- return XDP_PASS;
-
- /* XDP packets must be linear and must have sufficient headroom
- * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
- * native XDP provides, thus we need to do it here as well.
- */
- if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
- skb_headroom(skb) < XDP_PACKET_HEADROOM) {
- int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
- int troom = skb->tail + skb->data_len - skb->end;
-
- /* In case we have to go down the path and also linearize,
- * then lets do the pskb_expand_head() work just once here.
- */
- if (pskb_expand_head(skb,
- hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
- troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
- goto do_drop;
- if (skb_linearize(skb))
- goto do_drop;
- }
+ u32 metalen, act;
+ int off;
/* The XDP program wants to see the packet starting at the MAC
* header.
*/
mac_len = skb->data - skb_mac_header(skb);
- hlen = skb_headlen(skb) + mac_len;
- xdp->data = skb->data - mac_len;
- xdp->data_meta = xdp->data;
- xdp->data_end = xdp->data + hlen;
- xdp->data_hard_start = skb->data - skb_headroom(skb);
+ hard_start = skb->data - skb_headroom(skb);
+
+ /* SKB "head" area always have tailroom for skb_shared_info */
+ frame_sz = (void *)skb_end_pointer(skb) - hard_start;
+ frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ rxqueue = netif_get_rxqueue(skb);
+ xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
+ xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
+ skb_headlen(skb) + mac_len, true);
+
orig_data_end = xdp->data_end;
orig_data = xdp->data;
eth = (struct ethhdr *)xdp->data;
+ orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
orig_eth_type = eth->h_proto;
- rxqueue = netif_get_rxqueue(skb);
- xdp->rxq = &rxqueue->xdp_rxq;
-
act = bpf_prog_run_xdp(xdp_prog, xdp);
/* check if bpf_xdp_adjust_head was used */
@@ -4571,24 +4770,31 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
skb_reset_network_header(skb);
}
- /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
- * pckt.
- */
- off = orig_data_end - xdp->data_end;
+ /* check if bpf_xdp_adjust_tail was used */
+ off = xdp->data_end - orig_data_end;
if (off != 0) {
skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
- skb->len -= off;
-
+ skb->len += off; /* positive on grow, negative on shrink */
}
/* check if XDP changed eth hdr such SKB needs update */
eth = (struct ethhdr *)xdp->data;
if ((orig_eth_type != eth->h_proto) ||
+ (orig_host != ether_addr_equal_64bits(eth->h_dest,
+ skb->dev->dev_addr)) ||
(orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
__skb_push(skb, ETH_HLEN);
+ skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, skb->dev);
}
+ /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
+ * before calling us again on redirect path. We do not call do_redirect
+ * as we leave that up to the caller.
+ *
+ * Caller is responsible for managing lifetime of skb (i.e. calling
+ * kfree_skb in response to actions it cannot handle/XDP_DROP).
+ */
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
@@ -4599,12 +4805,55 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
if (metalen)
skb_metadata_set(skb, metalen);
break;
+ }
+
+ return act;
+}
+
+static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+ struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
+{
+ u32 act = XDP_DROP;
+
+ /* Reinjected packets coming from act_mirred or similar should
+ * not get XDP generic processing.
+ */
+ if (skb_is_redirected(skb))
+ return XDP_PASS;
+
+ /* XDP packets must be linear and must have sufficient headroom
+ * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
+ * native XDP provides, thus we need to do it here as well.
+ */
+ if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
+ skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+ int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+ int troom = skb->tail + skb->data_len - skb->end;
+
+ /* In case we have to go down the path and also linearize,
+ * then lets do the pskb_expand_head() work just once here.
+ */
+ if (pskb_expand_head(skb,
+ hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+ troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
+ goto do_drop;
+ if (skb_linearize(skb))
+ goto do_drop;
+ }
+
+ act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
+ switch (act) {
+ case XDP_REDIRECT:
+ case XDP_TX:
+ case XDP_PASS:
+ break;
default:
- bpf_warn_invalid_xdp_action(act);
- /* fall through */
+ bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
+ fallthrough;
case XDP_ABORTED:
trace_xdp_exception(skb->dev, xdp_prog, act);
- /* fall through */
+ fallthrough;
case XDP_DROP:
do_drop:
kfree_skb(skb);
@@ -4615,7 +4864,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
}
/* When doing generic XDP we have to bypass the qdisc layer and the
- * network taps in order to match in-driver-XDP behavior.
+ * network taps in order to match in-driver-XDP behavior. This also means
+ * that XDP packets are able to starve other packets going through a qdisc,
+ * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
+ * queues, so they do not have this starvation issue.
*/
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
{
@@ -4627,7 +4879,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
txq = netdev_core_pick_tx(dev, skb, NULL);
cpu = smp_processor_id();
HARD_TX_LOCK(dev, txq, cpu);
- if (!netif_xmit_stopped(txq)) {
+ if (!netif_xmit_frozen_or_drv_stopped(txq)) {
rc = netdev_start_xmit(skb, dev, txq, 0);
if (dev_xmit_complete(rc))
free_skb = false;
@@ -4635,10 +4887,10 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
HARD_TX_UNLOCK(dev, txq);
if (free_skb) {
trace_xdp_exception(dev, xdp_prog, XDP_TX);
+ dev_core_stats_tx_dropped_inc(dev);
kfree_skb(skb);
}
}
-EXPORT_SYMBOL_GPL(generic_xdp_tx);
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
@@ -4667,7 +4919,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
}
return XDP_PASS;
out_redir:
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
return XDP_DROP;
}
EXPORT_SYMBOL_GPL(do_xdp_generic);
@@ -4676,7 +4928,7 @@ static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
- net_timestamp_check(netdev_tstamp_prequeue, skb);
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
trace_netif_rx(skb);
@@ -4685,7 +4937,6 @@ static int netif_rx_internal(struct sk_buff *skb)
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
- preempt_disable();
rcu_read_lock();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -4695,63 +4946,72 @@ static int netif_rx_internal(struct sk_buff *skb)
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
- preempt_enable();
} else
#endif
{
unsigned int qtail;
- ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
- put_cpu();
+ ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
}
return ret;
}
/**
+ * __netif_rx - Slightly optimized version of netif_rx
+ * @skb: buffer to post
+ *
+ * This behaves as netif_rx except that it does not disable bottom halves.
+ * As a result this function may only be invoked from the interrupt context
+ * (either hard or soft interrupt).
+ */
+int __netif_rx(struct sk_buff *skb)
+{
+ int ret;
+
+ lockdep_assert_once(hardirq_count() | softirq_count());
+
+ trace_netif_rx_entry(skb);
+ ret = netif_rx_internal(skb);
+ trace_netif_rx_exit(ret);
+ return ret;
+}
+EXPORT_SYMBOL(__netif_rx);
+
+/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
*
* This function receives a packet from a device driver and queues it for
- * the upper (protocol) levels to process. It always succeeds. The buffer
- * may be dropped during processing for congestion control or by the
- * protocol layers.
+ * the upper (protocol) levels to process via the backlog NAPI device. It
+ * always succeeds. The buffer may be dropped during processing for
+ * congestion control or by the protocol layers.
+ * The network buffer is passed via the backlog NAPI device. Modern NIC
+ * driver should use NAPI and GRO.
+ * This function can used from interrupt and from process context. The
+ * caller from process context must not disable interrupts before invoking
+ * this function.
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped)
*
*/
-
int netif_rx(struct sk_buff *skb)
{
+ bool need_bh_off = !(hardirq_count() | softirq_count());
int ret;
+ if (need_bh_off)
+ local_bh_disable();
trace_netif_rx_entry(skb);
-
ret = netif_rx_internal(skb);
trace_netif_rx_exit(ret);
-
+ if (need_bh_off)
+ local_bh_enable();
return ret;
}
EXPORT_SYMBOL(netif_rx);
-int netif_rx_ni(struct sk_buff *skb)
-{
- int err;
-
- trace_netif_rx_ni_entry(skb);
-
- preempt_disable();
- err = netif_rx_internal(skb);
- if (local_softirq_pending())
- do_softirq();
- preempt_enable();
- trace_netif_rx_ni_exit(err);
-
- return err;
-}
-EXPORT_SYMBOL(netif_rx_ni);
-
static __latent_entropy void net_tx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -4773,15 +5033,14 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
trace_consume_skb(skb);
else
- trace_kfree_skb(skb, net_tx_action);
+ trace_kfree_skb(skb, net_tx_action,
+ SKB_DROP_REASON_NOT_SPECIFIED);
if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
__kfree_skb(skb);
else
__kfree_skb_defer(skb);
}
-
- __kfree_skb_flush();
}
if (sd->output_queue) {
@@ -4793,25 +5052,43 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
sd->output_queue_tailp = &sd->output_queue;
local_irq_enable();
+ rcu_read_lock();
+
while (head) {
struct Qdisc *q = head;
spinlock_t *root_lock = NULL;
head = head->next_sched;
- if (!(q->flags & TCQ_F_NOLOCK)) {
- root_lock = qdisc_lock(q);
- spin_lock(root_lock);
- }
/* We need to make sure head->next_sched is read
* before clearing __QDISC_STATE_SCHED
*/
smp_mb__before_atomic();
+
+ if (!(q->flags & TCQ_F_NOLOCK)) {
+ root_lock = qdisc_lock(q);
+ spin_lock(root_lock);
+ } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
+ &q->state))) {
+ /* There is a synchronize_net() between
+ * STATE_DEACTIVATED flag being set and
+ * qdisc_reset()/some_qdisc_is_busy() in
+ * dev_deactivate(), so we can safely bail out
+ * early here to avoid data race between
+ * qdisc_deactivate() and some_qdisc_is_busy()
+ * for lockless qdisc.
+ */
+ clear_bit(__QDISC_STATE_SCHED, &q->state);
+ continue;
+ }
+
clear_bit(__QDISC_STATE_SCHED, &q->state);
qdisc_run(q);
if (root_lock)
spin_unlock(root_lock);
}
+
+ rcu_read_unlock();
}
xfrm_dev_backlog(sd);
@@ -4826,7 +5103,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
- struct net_device *orig_dev)
+ struct net_device *orig_dev, bool *another)
{
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
@@ -4846,22 +5123,26 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
}
qdisc_skb_cb(skb)->pkt_len = skb->len;
+ tc_skb_cb(skb)->mru = 0;
+ tc_skb_cb(skb)->post_ct = false;
skb->tc_at_ingress = 1;
mini_qdisc_bstats_cpu_update(miniq, skb);
- switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
+ switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
skb->tc_index = TC_H_MIN(cl_res.classid);
break;
case TC_ACT_SHOT:
mini_qdisc_qstats_cpu_drop(miniq);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+ *ret = NET_RX_DROP;
return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
consume_skb(skb);
+ *ret = NET_RX_SUCCESS;
return NULL;
case TC_ACT_REDIRECT:
/* skb_mac_header check was done by cls/act_bpf, so
@@ -4869,9 +5150,15 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
* redirecting to another netdev
*/
__skb_push(skb, skb->mac_len);
- skb_do_redirect(skb);
+ if (skb_do_redirect(skb) == -EAGAIN) {
+ __skb_pull(skb, skb->mac_len);
+ *another = true;
+ break;
+ }
+ *ret = NET_RX_SUCCESS;
return NULL;
case TC_ACT_CONSUMED:
+ *ret = NET_RX_SUCCESS;
return NULL;
default:
break;
@@ -4987,17 +5274,18 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
return 0;
}
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
+static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
+ struct sk_buff *skb = *pskb;
struct net_device *orig_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
- net_timestamp_check(!netdev_tstamp_prequeue, skb);
+ net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
trace_netif_receive_skb(skb);
@@ -5018,17 +5306,17 @@ another_round:
if (static_branch_unlikely(&generic_xdp_needed_key)) {
int ret2;
- preempt_disable();
+ migrate_disable();
ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
- preempt_enable();
+ migrate_enable();
- if (ret2 != XDP_PASS)
- return NET_RX_DROP;
- skb_reset_mac_len(skb);
+ if (ret2 != XDP_PASS) {
+ ret = NET_RX_DROP;
+ goto out;
+ }
}
- if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
- skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+ if (eth_type_vlan(skb->protocol)) {
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
goto out;
@@ -5055,15 +5343,22 @@ another_round:
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_branch_unlikely(&ingress_needed_key)) {
- skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
+ bool another = false;
+
+ nf_skip_egress(skb, true);
+ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
+ &another);
+ if (another)
+ goto another_round;
if (!skb)
goto out;
+ nf_skip_egress(skb, false);
if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
goto out;
}
#endif
- skb_reset_tc(skb);
+ skb_reset_redirect(skb);
skip_classify:
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
@@ -5093,6 +5388,7 @@ skip_classify:
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
+ break;
case RX_HANDLER_PASS:
break;
default:
@@ -5100,15 +5396,14 @@ skip_classify:
}
}
- if (unlikely(skb_vlan_tag_present(skb))) {
+ if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
check_vlan_id:
if (skb_vlan_tag_get_id(skb)) {
/* Vlan id is non 0 and vlan_do_receive() above couldn't
* find vlan device.
*/
skb->pkt_type = PACKET_OTHERHOST;
- } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
- skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+ } else if (eth_type_vlan(skb->protocol)) {
/* Outer header is 802.1P with vlan 0, inner header is
* 802.1Q or 802.1AD and vlan_do_receive() above could
* not find vlan dev for vlan id 0.
@@ -5162,10 +5457,10 @@ check_vlan_id:
} else {
drop:
if (!deliver_exact)
- atomic_long_inc(&skb->dev->rx_dropped);
+ dev_core_stats_rx_dropped_inc(skb->dev);
else
- atomic_long_inc(&skb->dev->rx_nohandler);
- kfree_skb(skb);
+ dev_core_stats_rx_nohandler_inc(skb->dev);
+ kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
@@ -5173,6 +5468,13 @@ drop:
}
out:
+ /* The invariant here is that if *ppt_prev is not NULL
+ * then skb should also be non-NULL.
+ *
+ * Apparently *ppt_prev assignment above holds this invariant due to
+ * skb dereferencing near it.
+ */
+ *pskb = skb;
return ret;
}
@@ -5182,7 +5484,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
struct packet_type *pt_prev = NULL;
int ret;
- ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
if (pt_prev)
ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
skb->dev, pt_prev, orig_dev);
@@ -5195,7 +5497,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
*
* More direct receive version of netif_receive_skb(). It should
* only be used by callers that have a need to skip RPS and Generic XDP.
- * Caller must also take care of handling if (page_is_)pfmemalloc.
+ * Caller must also take care of handling if ``(page_is_)pfmemalloc``.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
@@ -5260,7 +5562,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
struct packet_type *pt_prev = NULL;
skb_list_del_init(skb);
- __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
if (!pt_prev)
continue;
if (pt_curr != pt_prev || od_curr != orig_dev) {
@@ -5354,10 +5656,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
}
break;
- case XDP_QUERY_PROG:
- xdp->prog_id = old ? old->aux->id : 0;
- break;
-
default:
ret = -EINVAL;
break;
@@ -5370,7 +5668,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
{
int ret;
- net_timestamp_check(netdev_tstamp_prequeue, skb);
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
@@ -5393,14 +5691,14 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
return ret;
}
-static void netif_receive_skb_list_internal(struct list_head *head)
+void netif_receive_skb_list_internal(struct list_head *head)
{
struct sk_buff *skb, *next;
struct list_head sublist;
INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
- net_timestamp_check(netdev_tstamp_prequeue, skb);
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
skb_list_del_init(skb);
if (!skb_defer_rx_timestamp(skb))
list_add_tail(&skb->list, &sublist);
@@ -5479,7 +5777,7 @@ void netif_receive_skb_list(struct list_head *head)
}
EXPORT_SYMBOL(netif_receive_skb_list);
-DEFINE_PER_CPU(struct work_struct, flush_works);
+static DEFINE_PER_CPU(struct work_struct, flush_works);
/* Network device is going away, flush any packets still pending */
static void flush_backlog(struct work_struct *work)
@@ -5490,17 +5788,15 @@ static void flush_backlog(struct work_struct *work)
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
- local_irq_disable();
- rps_lock(sd);
+ rps_lock_irq_disable(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
- kfree_skb(skb);
+ dev_kfree_skb_irq(skb);
input_queue_head_incr(sd);
}
}
- rps_unlock(sd);
- local_irq_enable();
+ rps_unlock_irq_enable(sd);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev->reg_state == NETREG_UNREGISTERING) {
@@ -5512,560 +5808,62 @@ static void flush_backlog(struct work_struct *work)
local_bh_enable();
}
-static void flush_all_backlogs(void)
-{
- unsigned int cpu;
-
- get_online_cpus();
-
- for_each_online_cpu(cpu)
- queue_work_on(cpu, system_highpri_wq,
- per_cpu_ptr(&flush_works, cpu));
-
- for_each_online_cpu(cpu)
- flush_work(per_cpu_ptr(&flush_works, cpu));
-
- put_online_cpus();
-}
-
-/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
-static void gro_normal_list(struct napi_struct *napi)
-{
- if (!napi->rx_count)
- return;
- netif_receive_skb_list_internal(&napi->rx_list);
- INIT_LIST_HEAD(&napi->rx_list);
- napi->rx_count = 0;
-}
-
-/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
- * pass the whole batch up to the stack.
- */
-static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
-{
- list_add_tail(&skb->list, &napi->rx_list);
- if (++napi->rx_count >= gro_normal_batch)
- gro_normal_list(napi);
-}
-
-INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
-INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
-static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
-{
- struct packet_offload *ptype;
- __be16 type = skb->protocol;
- struct list_head *head = &offload_base;
- int err = -ENOENT;
-
- BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
-
- if (NAPI_GRO_CB(skb)->count == 1) {
- skb_shinfo(skb)->gso_size = 0;
- goto out;
- }
-
- rcu_read_lock();
- list_for_each_entry_rcu(ptype, head, list) {
- if (ptype->type != type || !ptype->callbacks.gro_complete)
- continue;
-
- err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
- ipv6_gro_complete, inet_gro_complete,
- skb, 0);
- break;
- }
- rcu_read_unlock();
-
- if (err) {
- WARN_ON(&ptype->list == head);
- kfree_skb(skb);
- return NET_RX_SUCCESS;
- }
-
-out:
- gro_normal_one(napi, skb);
- return NET_RX_SUCCESS;
-}
-
-static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
- bool flush_old)
-{
- struct list_head *head = &napi->gro_hash[index].list;
- struct sk_buff *skb, *p;
-
- list_for_each_entry_safe_reverse(skb, p, head, list) {
- if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
- return;
- skb_list_del_init(skb);
- napi_gro_complete(napi, skb);
- napi->gro_hash[index].count--;
- }
-
- if (!napi->gro_hash[index].count)
- __clear_bit(index, &napi->gro_bitmask);
-}
-
-/* napi->gro_hash[].list contains packets ordered by age.
- * youngest packets at the head of it.
- * Complete skbs in reverse order to reduce latencies.
- */
-void napi_gro_flush(struct napi_struct *napi, bool flush_old)
-{
- unsigned long bitmask = napi->gro_bitmask;
- unsigned int i, base = ~0U;
-
- while ((i = ffs(bitmask)) != 0) {
- bitmask >>= i;
- base += i;
- __napi_gro_flush_chain(napi, base, flush_old);
- }
-}
-EXPORT_SYMBOL(napi_gro_flush);
-
-static struct list_head *gro_list_prepare(struct napi_struct *napi,
- struct sk_buff *skb)
-{
- unsigned int maclen = skb->dev->hard_header_len;
- u32 hash = skb_get_hash_raw(skb);
- struct list_head *head;
- struct sk_buff *p;
-
- head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
- list_for_each_entry(p, head, list) {
- unsigned long diffs;
-
- NAPI_GRO_CB(p)->flush = 0;
-
- if (hash != skb_get_hash_raw(p)) {
- NAPI_GRO_CB(p)->same_flow = 0;
- continue;
- }
-
- diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
- diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
- if (skb_vlan_tag_present(p))
- diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
- diffs |= skb_metadata_dst_cmp(p, skb);
- diffs |= skb_metadata_differs(p, skb);
- if (maclen == ETH_HLEN)
- diffs |= compare_ether_header(skb_mac_header(p),
- skb_mac_header(skb));
- else if (!diffs)
- diffs = memcmp(skb_mac_header(p),
- skb_mac_header(skb),
- maclen);
- NAPI_GRO_CB(p)->same_flow = !diffs;
- }
-
- return head;
-}
-
-static void skb_gro_reset_offset(struct sk_buff *skb)
-{
- const struct skb_shared_info *pinfo = skb_shinfo(skb);
- const skb_frag_t *frag0 = &pinfo->frags[0];
-
- NAPI_GRO_CB(skb)->data_offset = 0;
- NAPI_GRO_CB(skb)->frag0 = NULL;
- NAPI_GRO_CB(skb)->frag0_len = 0;
-
- if (!skb_headlen(skb) && pinfo->nr_frags &&
- !PageHighMem(skb_frag_page(frag0))) {
- NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
- NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
- skb_frag_size(frag0),
- skb->end - skb->tail);
- }
-}
-
-static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
-{
- struct skb_shared_info *pinfo = skb_shinfo(skb);
-
- BUG_ON(skb->end - skb->tail < grow);
-
- memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
-
- skb->data_len -= grow;
- skb->tail += grow;
-
- skb_frag_off_add(&pinfo->frags[0], grow);
- skb_frag_size_sub(&pinfo->frags[0], grow);
-
- if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
- skb_frag_unref(skb, 0);
- memmove(pinfo->frags, pinfo->frags + 1,
- --pinfo->nr_frags * sizeof(pinfo->frags[0]));
- }
-}
-
-static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
+static bool flush_required(int cpu)
{
- struct sk_buff *oldest;
+#if IS_ENABLED(CONFIG_RPS)
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
+ bool do_flush;
- oldest = list_last_entry(head, struct sk_buff, list);
+ rps_lock_irq_disable(sd);
- /* We are called with head length >= MAX_GRO_SKBS, so this is
- * impossible.
+ /* as insertion into process_queue happens with the rps lock held,
+ * process_queue access may race only with dequeue
*/
- if (WARN_ON_ONCE(!oldest))
- return;
+ do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
+ !skb_queue_empty_lockless(&sd->process_queue);
+ rps_unlock_irq_enable(sd);
- /* Do not adjust napi->gro_hash[].count, caller is adding a new
- * SKB to the chain.
+ return do_flush;
+#endif
+ /* without RPS we can't safely check input_pkt_queue: during a
+ * concurrent remote skb_queue_splice() we can detect as empty both
+ * input_pkt_queue and process_queue even if the latter could end-up
+ * containing a lot of packets.
*/
- skb_list_del_init(oldest);
- napi_gro_complete(napi, oldest);
-}
-
-INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
- struct sk_buff *));
-INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
- struct sk_buff *));
-static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
-{
- u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
- struct list_head *head = &offload_base;
- struct packet_offload *ptype;
- __be16 type = skb->protocol;
- struct list_head *gro_head;
- struct sk_buff *pp = NULL;
- enum gro_result ret;
- int same_flow;
- int grow;
-
- if (netif_elide_gro(skb->dev))
- goto normal;
-
- gro_head = gro_list_prepare(napi, skb);
-
- rcu_read_lock();
- list_for_each_entry_rcu(ptype, head, list) {
- if (ptype->type != type || !ptype->callbacks.gro_receive)
- continue;
-
- skb_set_network_header(skb, skb_gro_offset(skb));
- skb_reset_mac_len(skb);
- NAPI_GRO_CB(skb)->same_flow = 0;
- NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
- NAPI_GRO_CB(skb)->free = 0;
- NAPI_GRO_CB(skb)->encap_mark = 0;
- NAPI_GRO_CB(skb)->recursion_counter = 0;
- NAPI_GRO_CB(skb)->is_fou = 0;
- NAPI_GRO_CB(skb)->is_atomic = 1;
- NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
-
- /* Setup for GRO checksum validation */
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- NAPI_GRO_CB(skb)->csum = skb->csum;
- NAPI_GRO_CB(skb)->csum_valid = 1;
- NAPI_GRO_CB(skb)->csum_cnt = 0;
- break;
- case CHECKSUM_UNNECESSARY:
- NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
- NAPI_GRO_CB(skb)->csum_valid = 0;
- break;
- default:
- NAPI_GRO_CB(skb)->csum_cnt = 0;
- NAPI_GRO_CB(skb)->csum_valid = 0;
- }
-
- pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
- ipv6_gro_receive, inet_gro_receive,
- gro_head, skb);
- break;
- }
- rcu_read_unlock();
-
- if (&ptype->list == head)
- goto normal;
-
- if (PTR_ERR(pp) == -EINPROGRESS) {
- ret = GRO_CONSUMED;
- goto ok;
- }
-
- same_flow = NAPI_GRO_CB(skb)->same_flow;
- ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
-
- if (pp) {
- skb_list_del_init(pp);
- napi_gro_complete(napi, pp);
- napi->gro_hash[hash].count--;
- }
-
- if (same_flow)
- goto ok;
-
- if (NAPI_GRO_CB(skb)->flush)
- goto normal;
-
- if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
- gro_flush_oldest(napi, gro_head);
- } else {
- napi->gro_hash[hash].count++;
- }
- NAPI_GRO_CB(skb)->count = 1;
- NAPI_GRO_CB(skb)->age = jiffies;
- NAPI_GRO_CB(skb)->last = skb;
- skb_shinfo(skb)->gso_size = skb_gro_len(skb);
- list_add(&skb->list, gro_head);
- ret = GRO_HELD;
-
-pull:
- grow = skb_gro_offset(skb) - skb_headlen(skb);
- if (grow > 0)
- gro_pull_from_frag0(skb, grow);
-ok:
- if (napi->gro_hash[hash].count) {
- if (!test_bit(hash, &napi->gro_bitmask))
- __set_bit(hash, &napi->gro_bitmask);
- } else if (test_bit(hash, &napi->gro_bitmask)) {
- __clear_bit(hash, &napi->gro_bitmask);
- }
-
- return ret;
-
-normal:
- ret = GRO_NORMAL;
- goto pull;
-}
-
-struct packet_offload *gro_find_receive_by_type(__be16 type)
-{
- struct list_head *offload_head = &offload_base;
- struct packet_offload *ptype;
-
- list_for_each_entry_rcu(ptype, offload_head, list) {
- if (ptype->type != type || !ptype->callbacks.gro_receive)
- continue;
- return ptype;
- }
- return NULL;
-}
-EXPORT_SYMBOL(gro_find_receive_by_type);
-
-struct packet_offload *gro_find_complete_by_type(__be16 type)
-{
- struct list_head *offload_head = &offload_base;
- struct packet_offload *ptype;
-
- list_for_each_entry_rcu(ptype, offload_head, list) {
- if (ptype->type != type || !ptype->callbacks.gro_complete)
- continue;
- return ptype;
- }
- return NULL;
-}
-EXPORT_SYMBOL(gro_find_complete_by_type);
-
-static void napi_skb_free_stolen_head(struct sk_buff *skb)
-{
- skb_dst_drop(skb);
- skb_ext_put(skb);
- kmem_cache_free(skbuff_head_cache, skb);
-}
-
-static gro_result_t napi_skb_finish(struct napi_struct *napi,
- struct sk_buff *skb,
- gro_result_t ret)
-{
- switch (ret) {
- case GRO_NORMAL:
- gro_normal_one(napi, skb);
- break;
-
- case GRO_DROP:
- kfree_skb(skb);
- break;
-
- case GRO_MERGED_FREE:
- if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
- napi_skb_free_stolen_head(skb);
- else
- __kfree_skb(skb);
- break;
-
- case GRO_HELD:
- case GRO_MERGED:
- case GRO_CONSUMED:
- break;
- }
-
- return ret;
-}
-
-gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
-{
- gro_result_t ret;
-
- skb_mark_napi_id(skb, napi);
- trace_napi_gro_receive_entry(skb);
-
- skb_gro_reset_offset(skb);
-
- ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
- trace_napi_gro_receive_exit(ret);
-
- return ret;
-}
-EXPORT_SYMBOL(napi_gro_receive);
-
-static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
-{
- if (unlikely(skb->pfmemalloc)) {
- consume_skb(skb);
- return;
- }
- __skb_pull(skb, skb_headlen(skb));
- /* restore the reserve we had after netdev_alloc_skb_ip_align() */
- skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
- __vlan_hwaccel_clear_tag(skb);
- skb->dev = napi->dev;
- skb->skb_iif = 0;
-
- /* eth_type_trans() assumes pkt_type is PACKET_HOST */
- skb->pkt_type = PACKET_HOST;
-
- skb->encapsulation = 0;
- skb_shinfo(skb)->gso_type = 0;
- skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
- skb_ext_reset(skb);
-
- napi->skb = skb;
-}
-
-struct sk_buff *napi_get_frags(struct napi_struct *napi)
-{
- struct sk_buff *skb = napi->skb;
-
- if (!skb) {
- skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
- if (skb) {
- napi->skb = skb;
- skb_mark_napi_id(skb, napi);
- }
- }
- return skb;
+ return true;
}
-EXPORT_SYMBOL(napi_get_frags);
-static gro_result_t napi_frags_finish(struct napi_struct *napi,
- struct sk_buff *skb,
- gro_result_t ret)
+static void flush_all_backlogs(void)
{
- switch (ret) {
- case GRO_NORMAL:
- case GRO_HELD:
- __skb_push(skb, ETH_HLEN);
- skb->protocol = eth_type_trans(skb, skb->dev);
- if (ret == GRO_NORMAL)
- gro_normal_one(napi, skb);
- break;
-
- case GRO_DROP:
- napi_reuse_skb(napi, skb);
- break;
-
- case GRO_MERGED_FREE:
- if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
- napi_skb_free_stolen_head(skb);
- else
- napi_reuse_skb(napi, skb);
- break;
-
- case GRO_MERGED:
- case GRO_CONSUMED:
- break;
- }
-
- return ret;
-}
+ static cpumask_t flush_cpus;
+ unsigned int cpu;
-/* Upper GRO stack assumes network header starts at gro_offset=0
- * Drivers could call both napi_gro_frags() and napi_gro_receive()
- * We copy ethernet header into skb->data to have a common layout.
- */
-static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
-{
- struct sk_buff *skb = napi->skb;
- const struct ethhdr *eth;
- unsigned int hlen = sizeof(*eth);
+ /* since we are under rtnl lock protection we can use static data
+ * for the cpumask and avoid allocating on stack the possibly
+ * large mask
+ */
+ ASSERT_RTNL();
- napi->skb = NULL;
+ cpus_read_lock();
- skb_reset_mac_header(skb);
- skb_gro_reset_offset(skb);
-
- if (unlikely(skb_gro_header_hard(skb, hlen))) {
- eth = skb_gro_header_slow(skb, hlen, 0);
- if (unlikely(!eth)) {
- net_warn_ratelimited("%s: dropping impossible skb from %s\n",
- __func__, napi->dev->name);
- napi_reuse_skb(napi, skb);
- return NULL;
+ cpumask_clear(&flush_cpus);
+ for_each_online_cpu(cpu) {
+ if (flush_required(cpu)) {
+ queue_work_on(cpu, system_highpri_wq,
+ per_cpu_ptr(&flush_works, cpu));
+ cpumask_set_cpu(cpu, &flush_cpus);
}
- } else {
- eth = (const struct ethhdr *)skb->data;
- gro_pull_from_frag0(skb, hlen);
- NAPI_GRO_CB(skb)->frag0 += hlen;
- NAPI_GRO_CB(skb)->frag0_len -= hlen;
}
- __skb_pull(skb, hlen);
- /*
- * This works because the only protocols we care about don't require
- * special handling.
- * We'll fix it up properly in napi_frags_finish()
+ /* we can have in flight packet[s] on the cpus we are not flushing,
+ * synchronize_net() in unregister_netdevice_many() will take care of
+ * them
*/
- skb->protocol = eth->h_proto;
-
- return skb;
-}
-
-gro_result_t napi_gro_frags(struct napi_struct *napi)
-{
- gro_result_t ret;
- struct sk_buff *skb = napi_frags_skb(napi);
-
- if (!skb)
- return GRO_DROP;
-
- trace_napi_gro_frags_entry(skb);
-
- ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
- trace_napi_gro_frags_exit(ret);
-
- return ret;
-}
-EXPORT_SYMBOL(napi_gro_frags);
-
-/* Compute the checksum from gro_offset and return the folded value
- * after adding in any pseudo checksum.
- */
-__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
-{
- __wsum wsum;
- __sum16 sum;
-
- wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
-
- /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
- sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
- /* See comments in __skb_checksum_complete(). */
- if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
- !skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev, skb);
- }
-
- NAPI_GRO_CB(skb)->csum = wsum;
- NAPI_GRO_CB(skb)->csum_valid = 1;
+ for_each_cpu(cpu, &flush_cpus)
+ flush_work(per_cpu_ptr(&flush_works, cpu));
- return sum;
+ cpus_read_unlock();
}
-EXPORT_SYMBOL(__skb_gro_checksum_complete);
static void net_rps_send_ipi(struct softnet_data *remsd)
{
@@ -6124,7 +5922,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
net_rps_action_and_irq_enable(sd);
}
- napi->weight = dev_rx_weight;
+ napi->weight = READ_ONCE(dev_rx_weight);
while (again) {
struct sk_buff *skb;
@@ -6138,8 +5936,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
}
- local_irq_disable();
- rps_lock(sd);
+ rps_lock_irq_disable(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
@@ -6155,8 +5952,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
}
- rps_unlock(sd);
- local_irq_enable();
+ rps_unlock_irq_enable(sd);
}
return work;
@@ -6184,7 +5980,7 @@ EXPORT_SYMBOL(__napi_schedule);
* @n: napi context
*
* Test if NAPI routine is already running, and if not mark
- * it as running. This is used as a condition variable
+ * it as running. This is used as a condition variable to
* insure only one NAPI poll instance runs. We also make
* sure there is no pending NAPI disable.
*/
@@ -6216,17 +6012,25 @@ EXPORT_SYMBOL(napi_schedule_prep);
* __napi_schedule_irqoff - schedule for receive
* @n: entry to schedule
*
- * Variant of __napi_schedule() assuming hard irqs are masked
+ * Variant of __napi_schedule() assuming hard irqs are masked.
+ *
+ * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
+ * because the interrupt disabled assumption might not be true
+ * due to force-threaded interrupts and spinlock substitution.
*/
void __napi_schedule_irqoff(struct napi_struct *n)
{
- ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ else
+ __napi_schedule(n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
bool napi_complete_done(struct napi_struct *n, int work_done)
{
- unsigned long flags, val, new;
+ unsigned long flags, val, new, timeout = 0;
+ bool ret = true;
/*
* 1) Don't let napi dequeue from the cpu poll list
@@ -6238,20 +6042,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
NAPIF_STATE_IN_BUSY_POLL)))
return false;
+ if (work_done) {
+ if (n->gro_bitmask)
+ timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
+ }
+ if (n->defer_hard_irqs_count > 0) {
+ n->defer_hard_irqs_count--;
+ timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ if (timeout)
+ ret = false;
+ }
if (n->gro_bitmask) {
- unsigned long timeout = 0;
-
- if (work_done)
- timeout = n->dev->gro_flush_timeout;
-
/* When the NAPI instance uses a timeout and keeps postponing
* it, we need to bound somehow the time packets are kept in
* the GRO layer
*/
napi_gro_flush(n, !!timeout);
- if (timeout)
- hrtimer_start(&n->timer, ns_to_ktime(timeout),
- HRTIMER_MODE_REL_PINNED);
}
gro_normal_list(n);
@@ -6268,7 +6075,9 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
- new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+ NAPIF_STATE_SCHED_THREADED |
+ NAPIF_STATE_PREFER_BUSY_POLL);
/* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time.
@@ -6283,7 +6092,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
return false;
}
- return true;
+ if (timeout)
+ hrtimer_start(&n->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ return ret;
}
EXPORT_SYMBOL(napi_complete_done);
@@ -6302,10 +6114,30 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
#if defined(CONFIG_NET_RX_BUSY_POLL)
-#define BUSY_POLL_BUDGET 8
+static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
+{
+ if (!skip_schedule) {
+ gro_normal_list(napi);
+ __napi_schedule(napi);
+ return;
+ }
+
+ if (napi->gro_bitmask) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
+ */
+ napi_gro_flush(napi, HZ >= 1000);
+ }
+
+ gro_normal_list(napi);
+ clear_bit(NAPI_STATE_SCHED, &napi->state);
+}
-static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
+ u16 budget)
{
+ bool skip_schedule = false;
+ unsigned long timeout;
int rc;
/* Busy polling means there is a high chance device driver hard irq
@@ -6322,29 +6154,33 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
local_bh_disable();
+ if (prefer_busy_poll) {
+ napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
+ timeout = READ_ONCE(napi->dev->gro_flush_timeout);
+ if (napi->defer_hard_irqs_count && timeout) {
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
+ skip_schedule = true;
+ }
+ }
+
/* All we really want here is to re-enable device interrupts.
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/
- rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ rc = napi->poll(napi, budget);
/* We can't gro_normal_list() here, because napi->poll() might have
* rearmed the napi (napi_complete_done()) in which case it could
* already be running on another CPU.
*/
- trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi, rc, budget);
netpoll_poll_unlock(have_poll_lock);
- if (rc == BUSY_POLL_BUDGET) {
- /* As the whole budget was spent, we still own the napi so can
- * safely handle the rx_list.
- */
- gro_normal_list(napi);
- __napi_schedule(napi);
- }
+ if (rc == budget)
+ __busy_poll_stop(napi, skip_schedule);
local_bh_enable();
}
void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
- void *loop_end_arg)
+ void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
@@ -6372,17 +6208,23 @@ restart:
* we avoid dirtying napi->state as much as we can.
*/
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
- NAPIF_STATE_IN_BUSY_POLL))
+ NAPIF_STATE_IN_BUSY_POLL)) {
+ if (prefer_busy_poll)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
+ }
if (cmpxchg(&napi->state, val,
val | NAPIF_STATE_IN_BUSY_POLL |
- NAPIF_STATE_SCHED) != val)
+ NAPIF_STATE_SCHED) != val) {
+ if (prefer_busy_poll)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
+ }
have_poll_lock = netpoll_poll_lock(napi);
napi_poll = napi->poll;
}
- work = napi_poll(napi, BUSY_POLL_BUDGET);
- trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
+ work = napi_poll(napi, budget);
+ trace_napi_poll(napi, work, budget);
gro_normal_list(napi);
count:
if (work > 0)
@@ -6395,7 +6237,7 @@ count:
if (unlikely(need_resched())) {
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock);
+ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
rcu_read_unlock();
cond_resched();
@@ -6406,7 +6248,7 @@ count:
cpu_relax();
}
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock);
+ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
out:
rcu_read_unlock();
@@ -6417,8 +6259,7 @@ EXPORT_SYMBOL(napi_busy_loop);
static void napi_hash_add(struct napi_struct *napi)
{
- if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
- test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
+ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
return;
spin_lock(&napi_hash_lock);
@@ -6439,20 +6280,14 @@ static void napi_hash_add(struct napi_struct *napi)
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
*/
-bool napi_hash_del(struct napi_struct *napi)
+static void napi_hash_del(struct napi_struct *napi)
{
- bool rcu_sync_needed = false;
-
spin_lock(&napi_hash_lock);
- if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
- rcu_sync_needed = true;
- hlist_del_rcu(&napi->napi_hash_node);
- }
+ hlist_del_init_rcu(&napi->napi_hash_node);
+
spin_unlock(&napi_hash_lock);
- return rcu_sync_needed;
}
-EXPORT_SYMBOL_GPL(napi_hash_del);
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
@@ -6463,9 +6298,11 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
/* Note : we use a relaxed variant of napi_schedule_prep() not setting
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
- if (napi->gro_bitmask && !napi_disable_pending(napi) &&
- !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+ if (!napi_disable_pending(napi) &&
+ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
+ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
__napi_schedule_irqoff(napi);
+ }
return HRTIMER_NORESTART;
}
@@ -6481,10 +6318,58 @@ static void init_gro_hash(struct napi_struct *napi)
napi->gro_bitmask = 0;
}
-void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
- int (*poll)(struct napi_struct *, int), int weight)
+int dev_set_threaded(struct net_device *dev, bool threaded)
+{
+ struct napi_struct *napi;
+ int err = 0;
+
+ if (dev->threaded == threaded)
+ return 0;
+
+ if (threaded) {
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (!napi->thread) {
+ err = napi_kthread_create(napi);
+ if (err) {
+ threaded = false;
+ break;
+ }
+ }
+ }
+ }
+
+ dev->threaded = threaded;
+
+ /* Make sure kthread is created before THREADED bit
+ * is set.
+ */
+ smp_mb__before_atomic();
+
+ /* Setting/unsetting threaded mode on a napi might not immediately
+ * take effect, if the current napi instance is actively being
+ * polled. In this case, the switch between threaded mode and
+ * softirq mode will happen in the next round of napi_schedule().
+ * This should not cause hiccups/stalls to the live traffic.
+ */
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (threaded)
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+ else
+ clear_bit(NAPI_STATE_THREADED, &napi->state);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(dev_set_threaded);
+
+void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int), int weight)
{
+ if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
+ return;
+
INIT_LIST_HEAD(&napi->poll_list);
+ INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
napi->timer.function = napi_watchdog;
init_gro_hash(napi);
@@ -6496,25 +6381,44 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
netdev_err_once(dev, "%s() called with weight %d\n", __func__,
weight);
napi->weight = weight;
- list_add(&napi->dev_list, &dev->napi_list);
napi->dev = dev;
#ifdef CONFIG_NETPOLL
napi->poll_owner = -1;
#endif
set_bit(NAPI_STATE_SCHED, &napi->state);
+ set_bit(NAPI_STATE_NPSVC, &napi->state);
+ list_add_rcu(&napi->dev_list, &dev->napi_list);
napi_hash_add(napi);
+ napi_get_frags_check(napi);
+ /* Create kthread for this napi if dev->threaded is set.
+ * Clear dev->threaded if kthread creation failed so that
+ * threaded mode will not be enabled in napi_enable().
+ */
+ if (dev->threaded && napi_kthread_create(napi))
+ dev->threaded = 0;
}
-EXPORT_SYMBOL(netif_napi_add);
+EXPORT_SYMBOL(netif_napi_add_weight);
void napi_disable(struct napi_struct *n)
{
+ unsigned long val, new;
+
might_sleep();
set_bit(NAPI_STATE_DISABLE, &n->state);
- while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
- msleep(1);
- while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
- msleep(1);
+ for ( ; ; ) {
+ val = READ_ONCE(n->state);
+ if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
+ usleep_range(20, 200);
+ continue;
+ }
+
+ new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
+ new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
+
+ if (cmpxchg(&n->state, val, new) == val)
+ break;
+ }
hrtimer_cancel(&n->timer);
@@ -6522,6 +6426,28 @@ void napi_disable(struct napi_struct *n)
}
EXPORT_SYMBOL(napi_disable);
+/**
+ * napi_enable - enable NAPI scheduling
+ * @n: NAPI context
+ *
+ * Resume NAPI from being scheduled on this context.
+ * Must be paired with napi_disable.
+ */
+void napi_enable(struct napi_struct *n)
+{
+ unsigned long val, new;
+
+ do {
+ val = READ_ONCE(n->state);
+ BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
+
+ new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
+ if (n->dev->threaded && n->thread)
+ new |= NAPIF_STATE_THREADED;
+ } while (cmpxchg(&n->state, val, new) != val);
+}
+EXPORT_SYMBOL(napi_enable);
+
static void flush_gro_hash(struct napi_struct *napi)
{
int i;
@@ -6536,28 +6462,29 @@ static void flush_gro_hash(struct napi_struct *napi)
}
/* Must be called in process context */
-void netif_napi_del(struct napi_struct *napi)
+void __netif_napi_del(struct napi_struct *napi)
{
- might_sleep();
- if (napi_hash_del(napi))
- synchronize_net();
- list_del_init(&napi->dev_list);
+ if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
+ return;
+
+ napi_hash_del(napi);
+ list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
flush_gro_hash(napi);
napi->gro_bitmask = 0;
+
+ if (napi->thread) {
+ kthread_stop(napi->thread);
+ napi->thread = NULL;
+ }
}
-EXPORT_SYMBOL(netif_napi_del);
+EXPORT_SYMBOL(__netif_napi_del);
-static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+static int __napi_poll(struct napi_struct *n, bool *repoll)
{
- void *have;
int work, weight;
- list_del_init(&n->poll_list);
-
- have = netpoll_poll_lock(n);
-
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
@@ -6572,10 +6499,12 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
trace_napi_poll(n, work, weight);
}
- WARN_ON_ONCE(work > weight);
+ if (unlikely(work > weight))
+ netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
+ n->poll, work, weight);
if (likely(work < weight))
- goto out_unlock;
+ return work;
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
@@ -6584,7 +6513,20 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
*/
if (unlikely(napi_disable_pending(n))) {
napi_complete(n);
- goto out_unlock;
+ return work;
+ }
+
+ /* The NAPI context has more processing work, but busy-polling
+ * is preferred. Exit early.
+ */
+ if (napi_prefer_busy_poll(n)) {
+ if (napi_complete_done(n, work)) {
+ /* If timeout is not set, we need to make sure
+ * that the NAPI is re-scheduled.
+ */
+ napi_schedule(n);
+ }
+ return work;
}
if (n->gro_bitmask) {
@@ -6602,23 +6544,116 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
if (unlikely(!list_empty(&n->poll_list))) {
pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
n->dev ? n->dev->name : "backlog");
- goto out_unlock;
+ return work;
}
- list_add_tail(&n->poll_list, repoll);
+ *repoll = true;
+
+ return work;
+}
+
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+{
+ bool do_repoll = false;
+ void *have;
+ int work;
+
+ list_del_init(&n->poll_list);
+
+ have = netpoll_poll_lock(n);
+
+ work = __napi_poll(n, &do_repoll);
+
+ if (do_repoll)
+ list_add_tail(&n->poll_list, repoll);
-out_unlock:
netpoll_poll_unlock(have);
return work;
}
+static int napi_thread_wait(struct napi_struct *napi)
+{
+ bool woken = false;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+ /* Testing SCHED_THREADED bit here to make sure the current
+ * kthread owns this napi and could poll on this napi.
+ * Testing SCHED bit is not enough because SCHED bit might be
+ * set by some other busy poll thread or by napi_disable().
+ */
+ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
+ WARN_ON(!list_empty(&napi->poll_list));
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ schedule();
+ /* woken being true indicates this thread owns this napi. */
+ woken = true;
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return -1;
+}
+
+static int napi_threaded_poll(void *data)
+{
+ struct napi_struct *napi = data;
+ void *have;
+
+ while (!napi_thread_wait(napi)) {
+ for (;;) {
+ bool repoll = false;
+
+ local_bh_disable();
+
+ have = netpoll_poll_lock(napi);
+ __napi_poll(napi, &repoll);
+ netpoll_poll_unlock(have);
+
+ local_bh_enable();
+
+ if (!repoll)
+ break;
+
+ cond_resched();
+ }
+ }
+ return 0;
+}
+
+static void skb_defer_free_flush(struct softnet_data *sd)
+{
+ struct sk_buff *skb, *next;
+ unsigned long flags;
+
+ /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
+ if (!READ_ONCE(sd->defer_list))
+ return;
+
+ spin_lock_irqsave(&sd->defer_lock, flags);
+ skb = sd->defer_list;
+ sd->defer_list = NULL;
+ sd->defer_count = 0;
+ spin_unlock_irqrestore(&sd->defer_lock, flags);
+
+ while (skb != NULL) {
+ next = skb->next;
+ napi_consume_skb(skb, 1);
+ skb = next;
+ }
+}
+
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
- usecs_to_jiffies(netdev_budget_usecs);
- int budget = netdev_budget;
+ usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
+ int budget = READ_ONCE(netdev_budget);
LIST_HEAD(list);
LIST_HEAD(repoll);
@@ -6629,9 +6664,11 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
for (;;) {
struct napi_struct *n;
+ skb_defer_free_flush(sd);
+
if (list_empty(&list)) {
if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
- goto out;
+ goto end;
break;
}
@@ -6658,12 +6695,12 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
net_rps_action_and_irq_enable(sd);
-out:
- __kfree_skb_flush();
+end:;
}
struct netdev_adjacent {
struct net_device *dev;
+ netdevice_tracker dev_tracker;
/* upper master flag, there can only be one master device per list */
bool master;
@@ -6693,9 +6730,10 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
return NULL;
}
-static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data)
+static int ____netdev_has_upper_dev(struct net_device *upper_dev,
+ struct netdev_nested_priv *priv)
{
- struct net_device *dev = data;
+ struct net_device *dev = (struct net_device *)priv->data;
return upper_dev == dev;
}
@@ -6712,15 +6750,19 @@ static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data)
bool netdev_has_upper_dev(struct net_device *dev,
struct net_device *upper_dev)
{
+ struct netdev_nested_priv priv = {
+ .data = (void *)upper_dev,
+ };
+
ASSERT_RTNL();
return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
- upper_dev);
+ &priv);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
/**
- * netdev_has_upper_dev_all - Check if device is linked to an upper device
+ * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
* @dev: device
* @upper_dev: upper device to check
*
@@ -6732,8 +6774,12 @@ EXPORT_SYMBOL(netdev_has_upper_dev);
bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
struct net_device *upper_dev)
{
+ struct netdev_nested_priv priv = {
+ .data = (void *)upper_dev,
+ };
+
return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
- upper_dev);
+ &priv);
}
EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
@@ -6878,8 +6924,8 @@ static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
static int __netdev_walk_all_upper_dev(struct net_device *dev,
int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
{
struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
@@ -6891,7 +6937,7 @@ static int __netdev_walk_all_upper_dev(struct net_device *dev,
while (1) {
if (now != dev) {
- ret = fn(now, data);
+ ret = fn(now, priv);
if (ret)
return ret;
}
@@ -6927,8 +6973,8 @@ static int __netdev_walk_all_upper_dev(struct net_device *dev,
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
{
struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
@@ -6939,7 +6985,7 @@ int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
while (1) {
if (now != dev) {
- ret = fn(now, data);
+ ret = fn(now, priv);
if (ret)
return ret;
}
@@ -6975,10 +7021,15 @@ EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
static bool __netdev_has_upper_dev(struct net_device *dev,
struct net_device *upper_dev)
{
+ struct netdev_nested_priv priv = {
+ .flags = 0,
+ .data = (void *)upper_dev,
+ };
+
ASSERT_RTNL();
return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
- upper_dev);
+ &priv);
}
/**
@@ -7023,7 +7074,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev,
{
struct netdev_adjacent *lower;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
@@ -7096,8 +7147,8 @@ static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
int netdev_walk_all_lower_dev(struct net_device *dev,
int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
{
struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
@@ -7108,7 +7159,7 @@ int netdev_walk_all_lower_dev(struct net_device *dev,
while (1) {
if (now != dev) {
- ret = fn(now, data);
+ ret = fn(now, priv);
if (ret)
return ret;
}
@@ -7143,8 +7194,8 @@ EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
static int __netdev_walk_all_lower_dev(struct net_device *dev,
int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
{
struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
@@ -7156,7 +7207,7 @@ static int __netdev_walk_all_lower_dev(struct net_device *dev,
while (1) {
if (now != dev) {
- ret = fn(now, data);
+ ret = fn(now, priv);
if (ret)
return ret;
}
@@ -7245,22 +7296,44 @@ static u8 __netdev_lower_depth(struct net_device *dev)
return max_depth;
}
-static int __netdev_update_upper_level(struct net_device *dev, void *data)
+static int __netdev_update_upper_level(struct net_device *dev,
+ struct netdev_nested_priv *__unused)
{
dev->upper_level = __netdev_upper_depth(dev) + 1;
return 0;
}
-static int __netdev_update_lower_level(struct net_device *dev, void *data)
+#ifdef CONFIG_LOCKDEP
+static LIST_HEAD(net_unlink_list);
+
+static void net_unlink_todo(struct net_device *dev)
+{
+ if (list_empty(&dev->unlink_list))
+ list_add_tail(&dev->unlink_list, &net_unlink_list);
+}
+#endif
+
+static int __netdev_update_lower_level(struct net_device *dev,
+ struct netdev_nested_priv *priv)
{
dev->lower_level = __netdev_lower_depth(dev) + 1;
+
+#ifdef CONFIG_LOCKDEP
+ if (!priv)
+ return 0;
+
+ if (priv->flags & NESTED_SYNC_IMM)
+ dev->nested_level = dev->lower_level - 1;
+ if (priv->flags & NESTED_SYNC_TODO)
+ net_unlink_todo(dev);
+#endif
return 0;
}
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
{
struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
@@ -7271,7 +7344,7 @@ int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
while (1) {
if (now != dev) {
- ret = fn(now, data);
+ ret = fn(now, priv);
if (ret)
return ret;
}
@@ -7402,7 +7475,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj->ref_nr = 1;
adj->private = private;
adj->ignore = false;
- dev_hold(adj_dev);
+ netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
@@ -7431,8 +7504,8 @@ remove_symlinks:
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
+ netdev_put(adj_dev, &adj->dev_tracker);
kfree(adj);
- dev_put(adj_dev);
return ret;
}
@@ -7473,7 +7546,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
list_del_rcu(&adj->list);
pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
adj_dev->name, dev->name, adj_dev->name);
- dev_put(adj_dev);
+ netdev_put(adj_dev, &adj->dev_tracker);
kfree_rcu(adj, rcu);
}
@@ -7531,6 +7604,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master,
void *upper_priv, void *upper_info,
+ struct netdev_nested_priv *priv,
struct netlink_ext_ack *extack)
{
struct netdev_notifier_changeupper_info changeupper_info = {
@@ -7587,9 +7661,9 @@ static int __netdev_upper_dev_link(struct net_device *dev,
__netdev_update_upper_level(dev, NULL);
__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
- __netdev_update_lower_level(upper_dev, NULL);
+ __netdev_update_lower_level(upper_dev, priv);
__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
- NULL);
+ priv);
return 0;
@@ -7614,8 +7688,13 @@ int netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev,
struct netlink_ext_ack *extack)
{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
return __netdev_upper_dev_link(dev, upper_dev, false,
- NULL, NULL, extack);
+ NULL, NULL, &priv, extack);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
@@ -7638,21 +7717,19 @@ int netdev_master_upper_dev_link(struct net_device *dev,
void *upper_priv, void *upper_info,
struct netlink_ext_ack *extack)
{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
return __netdev_upper_dev_link(dev, upper_dev, true,
- upper_priv, upper_info, extack);
+ upper_priv, upper_info, &priv, extack);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
-/**
- * netdev_upper_dev_unlink - Removes a link to upper device
- * @dev: device
- * @upper_dev: new upper device
- *
- * Removes a link to device which is upper to this one. The caller must hold
- * the RTNL lock.
- */
-void netdev_upper_dev_unlink(struct net_device *dev,
- struct net_device *upper_dev)
+static void __netdev_upper_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct netdev_nested_priv *priv)
{
struct netdev_notifier_changeupper_info changeupper_info = {
.info = {
@@ -7677,9 +7754,28 @@ void netdev_upper_dev_unlink(struct net_device *dev,
__netdev_update_upper_level(dev, NULL);
__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
- __netdev_update_lower_level(upper_dev, NULL);
+ __netdev_update_lower_level(upper_dev, priv);
__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
- NULL);
+ priv);
+}
+
+/**
+ * netdev_upper_dev_unlink - Removes a link to upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ *
+ * Removes a link to device which is upper to this one. The caller must hold
+ * the RTNL lock.
+ */
+void netdev_upper_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
+ __netdev_upper_dev_unlink(dev, upper_dev, &priv);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
@@ -7715,6 +7811,10 @@ int netdev_adjacent_change_prepare(struct net_device *old_dev,
struct net_device *dev,
struct netlink_ext_ack *extack)
{
+ struct netdev_nested_priv priv = {
+ .flags = 0,
+ .data = NULL,
+ };
int err;
if (!new_dev)
@@ -7722,8 +7822,8 @@ int netdev_adjacent_change_prepare(struct net_device *old_dev,
if (old_dev && new_dev != old_dev)
netdev_adjacent_dev_disable(dev, old_dev);
-
- err = netdev_upper_dev_link(new_dev, dev, extack);
+ err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
+ extack);
if (err) {
if (old_dev && new_dev != old_dev)
netdev_adjacent_dev_enable(dev, old_dev);
@@ -7738,6 +7838,11 @@ void netdev_adjacent_change_commit(struct net_device *old_dev,
struct net_device *new_dev,
struct net_device *dev)
{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
if (!new_dev || !old_dev)
return;
@@ -7745,7 +7850,7 @@ void netdev_adjacent_change_commit(struct net_device *old_dev,
return;
netdev_adjacent_dev_enable(dev, old_dev);
- netdev_upper_dev_unlink(old_dev, dev);
+ __netdev_upper_dev_unlink(old_dev, dev, &priv);
}
EXPORT_SYMBOL(netdev_adjacent_change_commit);
@@ -7753,13 +7858,18 @@ void netdev_adjacent_change_abort(struct net_device *old_dev,
struct net_device *new_dev,
struct net_device *dev)
{
+ struct netdev_nested_priv priv = {
+ .flags = 0,
+ .data = NULL,
+ };
+
if (!new_dev)
return;
if (old_dev && new_dev != old_dev)
netdev_adjacent_dev_enable(dev, old_dev);
- netdev_upper_dev_unlink(new_dev, dev);
+ __netdev_upper_dev_unlink(new_dev, dev, &priv);
}
EXPORT_SYMBOL(netdev_adjacent_change_abort);
@@ -7785,6 +7895,298 @@ void netdev_bonding_info_change(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_bonding_info_change);
+static int netdev_offload_xstats_enable_l3(struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
+ };
+ int err;
+ int rc;
+
+ dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
+ GFP_KERNEL);
+ if (!dev->offload_xstats_l3)
+ return -ENOMEM;
+
+ rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
+ NETDEV_OFFLOAD_XSTATS_DISABLE,
+ &info.info);
+ err = notifier_to_errno(rc);
+ if (err)
+ goto free_stats;
+
+ return 0;
+
+free_stats:
+ kfree(dev->offload_xstats_l3);
+ dev->offload_xstats_l3 = NULL;
+ return err;
+}
+
+int netdev_offload_xstats_enable(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct netlink_ext_ack *extack)
+{
+ ASSERT_RTNL();
+
+ if (netdev_offload_xstats_enabled(dev, type))
+ return -EALREADY;
+
+ switch (type) {
+ case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+ return netdev_offload_xstats_enable_l3(dev, extack);
+ }
+
+ WARN_ON(1);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_enable);
+
+static void netdev_offload_xstats_disable_l3(struct net_device *dev)
+{
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
+ };
+
+ call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
+ &info.info);
+ kfree(dev->offload_xstats_l3);
+ dev->offload_xstats_l3 = NULL;
+}
+
+int netdev_offload_xstats_disable(struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ ASSERT_RTNL();
+
+ if (!netdev_offload_xstats_enabled(dev, type))
+ return -EALREADY;
+
+ switch (type) {
+ case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+ netdev_offload_xstats_disable_l3(dev);
+ return 0;
+ }
+
+ WARN_ON(1);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_disable);
+
+static void netdev_offload_xstats_disable_all(struct net_device *dev)
+{
+ netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
+}
+
+static struct rtnl_hw_stats64 *
+netdev_offload_xstats_get_ptr(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ switch (type) {
+ case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+ return dev->offload_xstats_l3;
+ }
+
+ WARN_ON(1);
+ return NULL;
+}
+
+bool netdev_offload_xstats_enabled(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ ASSERT_RTNL();
+
+ return netdev_offload_xstats_get_ptr(dev, type);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_enabled);
+
+struct netdev_notifier_offload_xstats_ru {
+ bool used;
+};
+
+struct netdev_notifier_offload_xstats_rd {
+ struct rtnl_hw_stats64 stats;
+ bool used;
+};
+
+static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
+ const struct rtnl_hw_stats64 *src)
+{
+ dest->rx_packets += src->rx_packets;
+ dest->tx_packets += src->tx_packets;
+ dest->rx_bytes += src->rx_bytes;
+ dest->tx_bytes += src->tx_bytes;
+ dest->rx_errors += src->rx_errors;
+ dest->tx_errors += src->tx_errors;
+ dest->rx_dropped += src->rx_dropped;
+ dest->tx_dropped += src->tx_dropped;
+ dest->multicast += src->multicast;
+}
+
+static int netdev_offload_xstats_get_used(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ bool *p_used,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_offload_xstats_ru report_used = {};
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .type = type,
+ .report_used = &report_used,
+ };
+ int rc;
+
+ WARN_ON(!netdev_offload_xstats_enabled(dev, type));
+ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
+ &info.info);
+ *p_used = report_used.used;
+ return notifier_to_errno(rc);
+}
+
+static int netdev_offload_xstats_get_stats(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct rtnl_hw_stats64 *p_stats,
+ bool *p_used,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_offload_xstats_rd report_delta = {};
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .type = type,
+ .report_delta = &report_delta,
+ };
+ struct rtnl_hw_stats64 *stats;
+ int rc;
+
+ stats = netdev_offload_xstats_get_ptr(dev, type);
+ if (WARN_ON(!stats))
+ return -EINVAL;
+
+ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
+ &info.info);
+
+ /* Cache whatever we got, even if there was an error, otherwise the
+ * successful stats retrievals would get lost.
+ */
+ netdev_hw_stats64_add(stats, &report_delta.stats);
+
+ if (p_stats)
+ *p_stats = *stats;
+ *p_used = report_delta.used;
+
+ return notifier_to_errno(rc);
+}
+
+int netdev_offload_xstats_get(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct rtnl_hw_stats64 *p_stats, bool *p_used,
+ struct netlink_ext_ack *extack)
+{
+ ASSERT_RTNL();
+
+ if (p_stats)
+ return netdev_offload_xstats_get_stats(dev, type, p_stats,
+ p_used, extack);
+ else
+ return netdev_offload_xstats_get_used(dev, type, p_used,
+ extack);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_get);
+
+void
+netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
+ const struct rtnl_hw_stats64 *stats)
+{
+ report_delta->used = true;
+ netdev_hw_stats64_add(&report_delta->stats, stats);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
+
+void
+netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
+{
+ report_used->used = true;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_report_used);
+
+void netdev_offload_xstats_push_delta(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ const struct rtnl_hw_stats64 *p_stats)
+{
+ struct rtnl_hw_stats64 *stats;
+
+ ASSERT_RTNL();
+
+ stats = netdev_offload_xstats_get_ptr(dev, type);
+ if (WARN_ON(!stats))
+ return;
+
+ netdev_hw_stats64_add(stats, p_stats);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
+
+/**
+ * netdev_get_xmit_slave - Get the xmit slave of master device
+ * @dev: device
+ * @skb: The packet
+ * @all_slaves: assume all the slaves are active
+ *
+ * The reference counters are not incremented so the caller must be
+ * careful with locks. The caller must hold RCU lock.
+ * %NULL is returned if no slave is found.
+ */
+
+struct net_device *netdev_get_xmit_slave(struct net_device *dev,
+ struct sk_buff *skb,
+ bool all_slaves)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_get_xmit_slave)
+ return NULL;
+ return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
+}
+EXPORT_SYMBOL(netdev_get_xmit_slave);
+
+static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
+ struct sock *sk)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_sk_get_lower_dev)
+ return NULL;
+ return ops->ndo_sk_get_lower_dev(dev, sk);
+}
+
+/**
+ * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
+ * @dev: device
+ * @sk: the socket
+ *
+ * %NULL is returned if no lower device is found.
+ */
+
+struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
+ struct sock *sk)
+{
+ struct net_device *lower;
+
+ lower = netdev_sk_get_lower_dev(dev, sk);
+ while (lower) {
+ dev = lower;
+ lower = netdev_sk_get_lower_dev(dev, sk);
+ }
+
+ return dev;
+}
+EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
+
static void netdev_adjacent_add_links(struct net_device *dev)
{
struct netdev_adjacent *iter;
@@ -7877,7 +8279,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private);
/**
- * netdev_lower_change - Dispatch event about lower device state change
+ * netdev_lower_state_changed - Dispatch event about lower device state change
* @lower_dev: device
* @lower_state_info: state to dispatch
*
@@ -7925,8 +8327,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
dev->flags &= ~IFF_PROMISC;
else {
dev->promiscuity -= inc;
- pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
- dev->name);
+ netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
return -EOVERFLOW;
}
}
@@ -7996,8 +8397,7 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
dev->flags &= ~IFF_ALLMULTI;
else {
dev->allmulti -= inc;
- pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
- dev->name);
+ netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
return -EOVERFLOW;
}
}
@@ -8351,7 +8751,6 @@ void dev_set_group(struct net_device *dev, int new_group)
{
dev->group = new_group;
}
-EXPORT_SYMBOL(dev_set_group);
/**
* dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
@@ -8407,6 +8806,48 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
}
EXPORT_SYMBOL(dev_set_mac_address);
+static DECLARE_RWSEM(dev_addr_sem);
+
+int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ down_write(&dev_addr_sem);
+ ret = dev_set_mac_address(dev, sa, extack);
+ up_write(&dev_addr_sem);
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mac_address_user);
+
+int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
+{
+ size_t size = sizeof(sa->sa_data);
+ struct net_device *dev;
+ int ret = 0;
+
+ down_read(&dev_addr_sem);
+ rcu_read_lock();
+
+ dev = dev_get_by_name_rcu(net, dev_name);
+ if (!dev) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+ if (!dev->addr_len)
+ memset(sa->sa_data, 0, size);
+ else
+ memcpy(sa->sa_data, dev->dev_addr,
+ min_t(size_t, size, dev->addr_len));
+ sa->sa_family = dev->type;
+
+unlock:
+ rcu_read_unlock();
+ up_read(&dev_addr_sem);
+ return ret;
+}
+EXPORT_SYMBOL(dev_get_mac_address);
+
/**
* dev_change_carrier - Change device carrier
* @dev: device
@@ -8424,7 +8865,6 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier)
return -ENODEV;
return ops->ndo_change_carrier(dev, new_carrier);
}
-EXPORT_SYMBOL(dev_change_carrier);
/**
* dev_get_phys_port_id - Get device physical port ID
@@ -8442,7 +8882,6 @@ int dev_get_phys_port_id(struct net_device *dev,
return -EOPNOTSUPP;
return ops->ndo_get_phys_port_id(dev, ppid);
}
-EXPORT_SYMBOL(dev_get_phys_port_id);
/**
* dev_get_phys_port_name - Get device physical port name
@@ -8465,7 +8904,6 @@ int dev_get_phys_port_name(struct net_device *dev,
}
return devlink_compat_phys_port_name_get(dev, name, len);
}
-EXPORT_SYMBOL(dev_get_phys_port_name);
/**
* dev_get_port_parent_id - Get the device's port parent identifier
@@ -8492,20 +8930,17 @@ int dev_get_port_parent_id(struct net_device *dev,
}
err = devlink_compat_switch_id_get(dev, ppid);
- if (!err || err != -EOPNOTSUPP)
+ if (!recurse || err != -EOPNOTSUPP)
return err;
- if (!recurse)
- return -EOPNOTSUPP;
-
netdev_for_each_lower_dev(dev, lower_dev, iter) {
- err = dev_get_port_parent_id(lower_dev, ppid, recurse);
+ err = dev_get_port_parent_id(lower_dev, ppid, true);
if (err)
break;
if (!first.id_len)
first = *ppid;
else if (memcmp(&first, ppid, sizeof(*ppid)))
- return -ENODATA;
+ return -EOPNOTSUPP;
}
return err;
@@ -8532,328 +8967,581 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
EXPORT_SYMBOL(netdev_port_same_parent_id);
/**
- * dev_change_proto_down - update protocol port state information
+ * dev_change_proto_down - set carrier according to proto_down.
+ *
* @dev: device
* @proto_down: new value
- *
- * This info can be used by switch drivers to set the phys state of the
- * port.
*/
int dev_change_proto_down(struct net_device *dev, bool proto_down)
{
- const struct net_device_ops *ops = dev->netdev_ops;
-
- if (!ops->ndo_change_proto_down)
+ if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
- return ops->ndo_change_proto_down(dev, proto_down);
+ if (proto_down)
+ netif_carrier_off(dev);
+ else
+ netif_carrier_on(dev);
+ dev->proto_down = proto_down;
+ return 0;
}
-EXPORT_SYMBOL(dev_change_proto_down);
/**
- * dev_change_proto_down_generic - generic implementation for
- * ndo_change_proto_down that sets carrier according to
- * proto_down.
+ * dev_change_proto_down_reason - proto down reason
*
* @dev: device
- * @proto_down: new value
+ * @mask: proto down mask
+ * @value: proto down value
*/
-int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
+void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
+ u32 value)
{
- if (proto_down)
- netif_carrier_off(dev);
- else
- netif_carrier_on(dev);
- dev->proto_down = proto_down;
- return 0;
+ int b;
+
+ if (!mask) {
+ dev->proto_down_reason = value;
+ } else {
+ for_each_set_bit(b, &mask, 32) {
+ if (value & (1 << b))
+ dev->proto_down_reason |= BIT(b);
+ else
+ dev->proto_down_reason &= ~BIT(b);
+ }
+ }
}
-EXPORT_SYMBOL(dev_change_proto_down_generic);
-u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
- enum bpf_netdev_command cmd)
+struct bpf_xdp_link {
+ struct bpf_link link;
+ struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
+ int flags;
+};
+
+static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
{
- struct netdev_bpf xdp;
+ if (flags & XDP_FLAGS_HW_MODE)
+ return XDP_MODE_HW;
+ if (flags & XDP_FLAGS_DRV_MODE)
+ return XDP_MODE_DRV;
+ if (flags & XDP_FLAGS_SKB_MODE)
+ return XDP_MODE_SKB;
+ return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
+}
+
+static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
+{
+ switch (mode) {
+ case XDP_MODE_SKB:
+ return generic_xdp_install;
+ case XDP_MODE_DRV:
+ case XDP_MODE_HW:
+ return dev->netdev_ops->ndo_bpf;
+ default:
+ return NULL;
+ }
+}
- if (!bpf_op)
- return 0;
+static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
+ enum bpf_xdp_mode mode)
+{
+ return dev->xdp_state[mode].link;
+}
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = cmd;
+static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
+ enum bpf_xdp_mode mode)
+{
+ struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
- /* Query must always succeed. */
- WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
+ if (link)
+ return link->link.prog;
+ return dev->xdp_state[mode].prog;
+}
- return xdp.prog_id;
+u8 dev_xdp_prog_count(struct net_device *dev)
+{
+ u8 count = 0;
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++)
+ if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
+ count++;
+ return count;
+}
+EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
+
+u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
+{
+ struct bpf_prog *prog = dev_xdp_prog(dev, mode);
+
+ return prog ? prog->aux->id : 0;
+}
+
+static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
+ struct bpf_xdp_link *link)
+{
+ dev->xdp_state[mode].link = link;
+ dev->xdp_state[mode].prog = NULL;
+}
+
+static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
+ struct bpf_prog *prog)
+{
+ dev->xdp_state[mode].link = NULL;
+ dev->xdp_state[mode].prog = prog;
}
-static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
- struct netlink_ext_ack *extack, u32 flags,
- struct bpf_prog *prog)
+static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
+ bpf_op_t bpf_op, struct netlink_ext_ack *extack,
+ u32 flags, struct bpf_prog *prog)
{
- bool non_hw = !(flags & XDP_FLAGS_HW_MODE);
- struct bpf_prog *prev_prog = NULL;
struct netdev_bpf xdp;
int err;
- if (non_hw) {
- prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op,
- XDP_QUERY_PROG));
- if (IS_ERR(prev_prog))
- prev_prog = NULL;
- }
-
memset(&xdp, 0, sizeof(xdp));
- if (flags & XDP_FLAGS_HW_MODE)
- xdp.command = XDP_SETUP_PROG_HW;
- else
- xdp.command = XDP_SETUP_PROG;
+ xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
xdp.extack = extack;
xdp.flags = flags;
xdp.prog = prog;
+ /* Drivers assume refcnt is already incremented (i.e, prog pointer is
+ * "moved" into driver), so they don't increment it on their own, but
+ * they do decrement refcnt when program is detached or replaced.
+ * Given net_device also owns link/prog, we need to bump refcnt here
+ * to prevent drivers from underflowing it.
+ */
+ if (prog)
+ bpf_prog_inc(prog);
err = bpf_op(dev, &xdp);
- if (!err && non_hw)
- bpf_prog_change_xdp(prev_prog, prog);
+ if (err) {
+ if (prog)
+ bpf_prog_put(prog);
+ return err;
+ }
- if (prev_prog)
- bpf_prog_put(prev_prog);
+ if (mode != XDP_MODE_HW)
+ bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
- return err;
+ return 0;
}
static void dev_xdp_uninstall(struct net_device *dev)
{
- struct netdev_bpf xdp;
- bpf_op_t ndo_bpf;
+ struct bpf_xdp_link *link;
+ struct bpf_prog *prog;
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
- /* Remove generic XDP */
- WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
+ ASSERT_RTNL();
- /* Remove from the driver */
- ndo_bpf = dev->netdev_ops->ndo_bpf;
- if (!ndo_bpf)
- return;
+ for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
+ prog = dev_xdp_prog(dev, mode);
+ if (!prog)
+ continue;
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = XDP_QUERY_PROG;
- WARN_ON(ndo_bpf(dev, &xdp));
- if (xdp.prog_id)
- WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
- NULL));
+ bpf_op = dev_xdp_bpf_op(dev, mode);
+ if (!bpf_op)
+ continue;
- /* Remove HW offload */
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = XDP_QUERY_PROG_HW;
- if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
- WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
- NULL));
+ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
+
+ /* auto-detach link from net device */
+ link = dev_xdp_link(dev, mode);
+ if (link)
+ link->dev = NULL;
+ else
+ bpf_prog_put(prog);
+
+ dev_xdp_set_link(dev, mode, NULL);
+ }
}
-/**
- * dev_change_xdp_fd - set or clear a bpf program for a device rx path
- * @dev: device
- * @extack: netlink extended ack
- * @fd: new program fd or negative value to clear
- * @flags: xdp-related flags
- *
- * Set or clear a bpf program for a device
- */
-int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
- int fd, u32 flags)
+static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
+ struct bpf_xdp_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog, u32 flags)
{
- const struct net_device_ops *ops = dev->netdev_ops;
- enum bpf_netdev_command query;
- struct bpf_prog *prog = NULL;
- bpf_op_t bpf_op, bpf_chk;
- bool offload;
+ unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
+ struct bpf_prog *cur_prog;
+ struct net_device *upper;
+ struct list_head *iter;
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
int err;
ASSERT_RTNL();
- offload = flags & XDP_FLAGS_HW_MODE;
- query = offload ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
-
- bpf_op = bpf_chk = ops->ndo_bpf;
- if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) {
- NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode");
- return -EOPNOTSUPP;
+ /* either link or prog attachment, never both */
+ if (link && (new_prog || old_prog))
+ return -EINVAL;
+ /* link supports only XDP mode flags */
+ if (link && (flags & ~XDP_FLAGS_MODES)) {
+ NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
+ return -EINVAL;
+ }
+ /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
+ if (num_modes > 1) {
+ NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
+ return -EINVAL;
+ }
+ /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
+ if (!num_modes && dev_xdp_prog_count(dev) > 1) {
+ NL_SET_ERR_MSG(extack,
+ "More than one program loaded, unset mode is ambiguous");
+ return -EINVAL;
+ }
+ /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
+ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
+ NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
+ return -EINVAL;
}
- if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
- bpf_op = generic_xdp_install;
- if (bpf_op == bpf_chk)
- bpf_chk = generic_xdp_install;
- if (fd >= 0) {
- u32 prog_id;
+ mode = dev_xdp_mode(dev, flags);
+ /* can't replace attached link */
+ if (dev_xdp_link(dev, mode)) {
+ NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
+ return -EBUSY;
+ }
- if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
- NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
+ /* don't allow if an upper device already has a program */
+ netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+ if (dev_xdp_prog_count(upper) > 0) {
+ NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
return -EEXIST;
}
+ }
+
+ cur_prog = dev_xdp_prog(dev, mode);
+ /* can't replace attached prog with link */
+ if (link && cur_prog) {
+ NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
+ return -EBUSY;
+ }
+ if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
+ NL_SET_ERR_MSG(extack, "Active program does not match expected");
+ return -EEXIST;
+ }
+
+ /* put effective new program into new_prog */
+ if (link)
+ new_prog = link->link.prog;
- prog_id = __dev_xdp_query(dev, bpf_op, query);
- if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
+ if (new_prog) {
+ bool offload = mode == XDP_MODE_HW;
+ enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
+ ? XDP_MODE_DRV : XDP_MODE_SKB;
+
+ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
NL_SET_ERR_MSG(extack, "XDP program already attached");
return -EBUSY;
}
-
- prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
- bpf_op == ops->ndo_bpf);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
- if (!offload && bpf_prog_is_dev_bound(prog->aux)) {
- NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
- bpf_prog_put(prog);
+ if (!offload && dev_xdp_prog(dev, other_mode)) {
+ NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
+ return -EEXIST;
+ }
+ if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
+ NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
+ return -EINVAL;
+ }
+ if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
+ NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
+ return -EINVAL;
+ }
+ if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
+ NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
return -EINVAL;
}
+ }
- /* prog->aux->id may be 0 for orphaned device-bound progs */
- if (prog->aux->id && prog->aux->id == prog_id) {
- bpf_prog_put(prog);
- return 0;
+ /* don't call drivers if the effective program didn't change */
+ if (new_prog != cur_prog) {
+ bpf_op = dev_xdp_bpf_op(dev, mode);
+ if (!bpf_op) {
+ NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
+ return -EOPNOTSUPP;
}
- } else {
- if (!__dev_xdp_query(dev, bpf_op, query))
- return 0;
+
+ err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
+ if (err)
+ return err;
}
- err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
- if (err < 0 && prog)
- bpf_prog_put(prog);
+ if (link)
+ dev_xdp_set_link(dev, mode, link);
+ else
+ dev_xdp_set_prog(dev, mode, new_prog);
+ if (cur_prog)
+ bpf_prog_put(cur_prog);
- return err;
+ return 0;
}
-/**
- * dev_new_index - allocate an ifindex
- * @net: the applicable net namespace
- *
- * Returns a suitable unique value for a new device interface
- * number. The caller must hold the rtnl semaphore or the
- * dev_base_lock to be sure it remains unique.
- */
-static int dev_new_index(struct net *net)
+static int dev_xdp_attach_link(struct net_device *dev,
+ struct netlink_ext_ack *extack,
+ struct bpf_xdp_link *link)
{
- int ifindex = net->ifindex;
+ return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
+}
- for (;;) {
- if (++ifindex <= 0)
- ifindex = 1;
- if (!__dev_get_by_index(net, ifindex))
- return net->ifindex = ifindex;
+static int dev_xdp_detach_link(struct net_device *dev,
+ struct netlink_ext_ack *extack,
+ struct bpf_xdp_link *link)
+{
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
+
+ ASSERT_RTNL();
+
+ mode = dev_xdp_mode(dev, link->flags);
+ if (dev_xdp_link(dev, mode) != link)
+ return -EINVAL;
+
+ bpf_op = dev_xdp_bpf_op(dev, mode);
+ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
+ dev_xdp_set_link(dev, mode, NULL);
+ return 0;
+}
+
+static void bpf_xdp_link_release(struct bpf_link *link)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+
+ rtnl_lock();
+
+ /* if racing with net_device's tear down, xdp_link->dev might be
+ * already NULL, in which case link was already auto-detached
+ */
+ if (xdp_link->dev) {
+ WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
+ xdp_link->dev = NULL;
}
+
+ rtnl_unlock();
}
-/* Delayed registration/unregisteration */
-static LIST_HEAD(net_todo_list);
-DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
+static int bpf_xdp_link_detach(struct bpf_link *link)
+{
+ bpf_xdp_link_release(link);
+ return 0;
+}
-static void net_set_todo(struct net_device *dev)
+static void bpf_xdp_link_dealloc(struct bpf_link *link)
{
- list_add_tail(&dev->todo_list, &net_todo_list);
- dev_net(dev)->dev_unreg_count++;
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+
+ kfree(xdp_link);
}
-static void rollback_registered_many(struct list_head *head)
+static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
{
- struct net_device *dev, *tmp;
- LIST_HEAD(close_head);
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+ u32 ifindex = 0;
- BUG_ON(dev_boot_phase);
- ASSERT_RTNL();
+ rtnl_lock();
+ if (xdp_link->dev)
+ ifindex = xdp_link->dev->ifindex;
+ rtnl_unlock();
- list_for_each_entry_safe(dev, tmp, head, unreg_list) {
- /* Some devices call without registering
- * for initialization unwind. Remove those
- * devices and proceed with the remaining.
- */
- if (dev->reg_state == NETREG_UNINITIALIZED) {
- pr_debug("unregister_netdevice: device %s/%p never was registered\n",
- dev->name, dev);
+ seq_printf(seq, "ifindex:\t%u\n", ifindex);
+}
- WARN_ON(1);
- list_del(&dev->unreg_list);
- continue;
- }
- dev->dismantle = true;
- BUG_ON(dev->reg_state != NETREG_REGISTERED);
- }
+static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+ u32 ifindex = 0;
- /* If device is running, close it first. */
- list_for_each_entry(dev, head, unreg_list)
- list_add_tail(&dev->close_list, &close_head);
- dev_close_many(&close_head, true);
+ rtnl_lock();
+ if (xdp_link->dev)
+ ifindex = xdp_link->dev->ifindex;
+ rtnl_unlock();
- list_for_each_entry(dev, head, unreg_list) {
- /* And unlink it from device chain. */
- unlist_netdevice(dev);
+ info->xdp.ifindex = ifindex;
+ return 0;
+}
- dev->reg_state = NETREG_UNREGISTERING;
+static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
+ int err = 0;
+
+ rtnl_lock();
+
+ /* link might have been auto-released already, so fail */
+ if (!xdp_link->dev) {
+ err = -ENOLINK;
+ goto out_unlock;
}
- flush_all_backlogs();
- synchronize_net();
+ if (old_prog && link->prog != old_prog) {
+ err = -EPERM;
+ goto out_unlock;
+ }
+ old_prog = link->prog;
+ if (old_prog->type != new_prog->type ||
+ old_prog->expected_attach_type != new_prog->expected_attach_type) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
- list_for_each_entry(dev, head, unreg_list) {
- struct sk_buff *skb = NULL;
+ if (old_prog == new_prog) {
+ /* no-op, don't disturb drivers */
+ bpf_prog_put(new_prog);
+ goto out_unlock;
+ }
- /* Shutdown queueing discipline. */
- dev_shutdown(dev);
+ mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
+ bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
+ err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
+ xdp_link->flags, new_prog);
+ if (err)
+ goto out_unlock;
- dev_xdp_uninstall(dev);
+ old_prog = xchg(&link->prog, new_prog);
+ bpf_prog_put(old_prog);
- /* Notify protocols, that we are about to destroy
- * this device. They should clean all the things.
- */
- call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+out_unlock:
+ rtnl_unlock();
+ return err;
+}
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
- GFP_KERNEL, NULL, 0);
+static const struct bpf_link_ops bpf_xdp_link_lops = {
+ .release = bpf_xdp_link_release,
+ .dealloc = bpf_xdp_link_dealloc,
+ .detach = bpf_xdp_link_detach,
+ .show_fdinfo = bpf_xdp_link_show_fdinfo,
+ .fill_link_info = bpf_xdp_link_fill_link_info,
+ .update_prog = bpf_xdp_link_update,
+};
- /*
- * Flush the unicast and multicast chains
- */
- dev_uc_flush(dev);
- dev_mc_flush(dev);
+int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_link_primer link_primer;
+ struct bpf_xdp_link *link;
+ struct net_device *dev;
+ int err, fd;
- netdev_name_node_alt_flush(dev);
- netdev_name_node_free(dev->name_node);
+ rtnl_lock();
+ dev = dev_get_by_index(net, attr->link_create.target_ifindex);
+ if (!dev) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
- if (dev->netdev_ops->ndo_uninit)
- dev->netdev_ops->ndo_uninit(dev);
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto unlock;
+ }
- if (skb)
- rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
+ link->dev = dev;
+ link->flags = attr->link_create.flags;
- /* Notifier chain MUST detach us all upper devices. */
- WARN_ON(netdev_has_any_upper_dev(dev));
- WARN_ON(netdev_has_any_lower_dev(dev));
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ goto unlock;
+ }
- /* Remove entries from kobject tree */
- netdev_unregister_kobject(dev);
-#ifdef CONFIG_XPS
- /* Remove XPS queueing entries */
- netif_reset_xps_queues_gt(dev, 0);
-#endif
+ err = dev_xdp_attach_link(dev, NULL, link);
+ rtnl_unlock();
+
+ if (err) {
+ link->dev = NULL;
+ bpf_link_cleanup(&link_primer);
+ goto out_put_dev;
}
- synchronize_net();
+ fd = bpf_link_settle(&link_primer);
+ /* link itself doesn't hold dev's refcnt to not complicate shutdown */
+ dev_put(dev);
+ return fd;
- list_for_each_entry(dev, head, unreg_list)
- dev_put(dev);
+unlock:
+ rtnl_unlock();
+
+out_put_dev:
+ dev_put(dev);
+ return err;
}
-static void rollback_registered(struct net_device *dev)
+/**
+ * dev_change_xdp_fd - set or clear a bpf program for a device rx path
+ * @dev: device
+ * @extack: netlink extended ack
+ * @fd: new program fd or negative value to clear
+ * @expected_fd: old program fd that userspace expects to replace or clear
+ * @flags: xdp-related flags
+ *
+ * Set or clear a bpf program for a device
+ */
+int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
+ int fd, int expected_fd, u32 flags)
{
- LIST_HEAD(single);
+ enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
+ struct bpf_prog *new_prog = NULL, *old_prog = NULL;
+ int err;
- list_add(&dev->unreg_list, &single);
- rollback_registered_many(&single);
- list_del(&single);
+ ASSERT_RTNL();
+
+ if (fd >= 0) {
+ new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
+ mode != XDP_MODE_SKB);
+ if (IS_ERR(new_prog))
+ return PTR_ERR(new_prog);
+ }
+
+ if (expected_fd >= 0) {
+ old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
+ mode != XDP_MODE_SKB);
+ if (IS_ERR(old_prog)) {
+ err = PTR_ERR(old_prog);
+ old_prog = NULL;
+ goto err_out;
+ }
+ }
+
+ err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
+
+err_out:
+ if (err && new_prog)
+ bpf_prog_put(new_prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
+ return err;
+}
+
+/**
+ * dev_new_index - allocate an ifindex
+ * @net: the applicable net namespace
+ *
+ * Returns a suitable unique value for a new device interface
+ * number. The caller must hold the rtnl semaphore or the
+ * dev_base_lock to be sure it remains unique.
+ */
+static int dev_new_index(struct net *net)
+{
+ int ifindex = net->ifindex;
+
+ for (;;) {
+ if (++ifindex <= 0)
+ ifindex = 1;
+ if (!__dev_get_by_index(net, ifindex))
+ return net->ifindex = ifindex;
+ }
+}
+
+/* Delayed registration/unregisteration */
+LIST_HEAD(net_todo_list);
+DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
+
+static void net_set_todo(struct net_device *dev)
+{
+ list_add_tail(&dev->todo_list, &net_todo_list);
+ atomic_inc(&dev_net(dev)->dev_unreg_count);
}
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@ -8889,11 +9577,13 @@ static void netdev_sync_lower_features(struct net_device *upper,
netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
&feature, lower->name);
lower->wanted_features &= ~feature;
- netdev_update_features(lower);
+ __netdev_update_features(lower);
if (unlikely(lower->features & feature))
netdev_WARN(upper, "failed to disable %pNF on %s!\n",
&feature, lower->name);
+ else
+ netdev_features_change(lower);
}
}
}
@@ -8974,6 +9664,27 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
}
}
+ if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
+ netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
+ features &= ~NETIF_F_LRO;
+ }
+
+ if (features & NETIF_F_HW_TLS_TX) {
+ bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
+ (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
+ bool hw_csum = features & NETIF_F_HW_CSUM;
+
+ if (!ip_csum && !hw_csum) {
+ netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_TX;
+ }
+ }
+
+ if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
+ netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_RX;
+ }
+
return features;
}
@@ -8994,7 +9705,7 @@ int __netdev_update_features(struct net_device *dev)
/* driver might be less strict about feature dependencies */
features = netdev_fix_features(dev, features);
- /* some features can't be enabled if they're off an an upper device */
+ /* some features can't be enabled if they're off on an upper device */
netdev_for_each_upper_dev_rcu(dev, upper, iter)
features = netdev_sync_upper_features(dev, upper, features);
@@ -9118,6 +9829,11 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
else
netif_dormant_off(dev);
+ if (rootdev->operstate == IF_OPER_TESTING)
+ netif_testing_on(dev);
+ else
+ netif_testing_off(dev);
+
if (netif_carrier_ok(rootdev))
netif_carrier_on(dev);
else
@@ -9134,7 +9850,7 @@ static int netif_alloc_rx_queues(struct net_device *dev)
BUG_ON(count < 1);
- rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!rx)
return -ENOMEM;
@@ -9144,7 +9860,7 @@ static int netif_alloc_rx_queues(struct net_device *dev)
rx[i].dev = dev;
/* XDP RX-queue setup */
- err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
+ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
if (err < 0)
goto err_rxq_info;
}
@@ -9178,7 +9894,7 @@ static void netdev_init_one_queue(struct net_device *dev,
{
/* Initialize queue lock */
spin_lock_init(&queue->_xmit_lock);
- lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key);
+ netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
queue->xmit_lock_owner = -1;
netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
queue->dev = dev;
@@ -9201,7 +9917,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
if (count < 1 || count > 0xffff)
return -EINVAL;
- tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!tx)
return -ENOMEM;
@@ -9225,48 +9941,15 @@ void netif_tx_stop_all_queues(struct net_device *dev)
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);
-static void netdev_register_lockdep_key(struct net_device *dev)
-{
- lockdep_register_key(&dev->qdisc_tx_busylock_key);
- lockdep_register_key(&dev->qdisc_running_key);
- lockdep_register_key(&dev->qdisc_xmit_lock_key);
- lockdep_register_key(&dev->addr_list_lock_key);
-}
-
-static void netdev_unregister_lockdep_key(struct net_device *dev)
-{
- lockdep_unregister_key(&dev->qdisc_tx_busylock_key);
- lockdep_unregister_key(&dev->qdisc_running_key);
- lockdep_unregister_key(&dev->qdisc_xmit_lock_key);
- lockdep_unregister_key(&dev->addr_list_lock_key);
-}
-
-void netdev_update_lockdep_key(struct net_device *dev)
-{
- lockdep_unregister_key(&dev->addr_list_lock_key);
- lockdep_register_key(&dev->addr_list_lock_key);
-
- lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
-}
-EXPORT_SYMBOL(netdev_update_lockdep_key);
-
/**
- * register_netdevice - register a network device
- * @dev: device to register
- *
- * Take a completed network device structure and add it to the kernel
- * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
- * chain. 0 is returned on success. A negative errno code is returned
- * on a failure to set up the device, or if the name is a duplicate.
- *
- * Callers must hold the rtnl semaphore. You may want
- * register_netdev() instead of this.
+ * register_netdevice() - register a network device
+ * @dev: device to register
*
- * BUGS:
- * The locking appears insufficient to guarantee two parallel registers
- * will not get the same name.
+ * Take a prepared network device structure and make it externally accessible.
+ * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
+ * Callers must hold the rtnl lock - you may want register_netdev()
+ * instead of this.
*/
-
int register_netdevice(struct net_device *dev)
{
int ret;
@@ -9283,8 +9966,12 @@ int register_netdevice(struct net_device *dev)
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
+ ret = ethtool_check_ops(dev->ethtool_ops);
+ if (ret)
+ return ret;
+
spin_lock_init(&dev->addr_list_lock);
- lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
+ netdev_set_addr_lockdep_class(dev);
ret = dev_get_valid_name(net, dev, dev->name);
if (ret < 0)
@@ -9326,7 +10013,7 @@ int register_netdevice(struct net_device *dev)
dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
dev->features |= NETIF_F_SOFT_FEATURES;
- if (dev->netdev_ops->ndo_udp_tunnel_add) {
+ if (dev->udp_tunnel_nic_info) {
dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
}
@@ -9368,11 +10055,11 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
ret = netdev_register_kobject(dev);
- if (ret) {
- dev->reg_state = NETREG_UNREGISTERED;
+ write_lock(&dev_base_lock);
+ dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
+ write_unlock(&dev_base_lock);
+ if (ret)
goto err_uninit;
- }
- dev->reg_state = NETREG_REGISTERED;
__netdev_update_features(dev);
@@ -9386,8 +10073,10 @@ int register_netdevice(struct net_device *dev)
linkwatch_init_dev(dev);
dev_init_scheduler(dev);
- dev_hold(dev);
+
+ netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
list_netdevice(dev);
+
add_device_randomness(dev->dev_addr, dev->addr_len);
/* If the device has permanent device address, driver should
@@ -9401,10 +10090,10 @@ int register_netdevice(struct net_device *dev)
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
ret = notifier_to_errno(ret);
if (ret) {
- rollback_registered(dev);
- rcu_barrier();
-
- dev->reg_state = NETREG_UNREGISTERED;
+ /* Expect explicit free_netdev() on failure */
+ dev->needs_free_netdev = false;
+ unregister_netdevice_queue(dev, NULL);
+ goto out;
}
/*
* Prevent userspace races by waiting until the network
@@ -9499,17 +10188,25 @@ EXPORT_SYMBOL(register_netdev);
int netdev_refcnt_read(const struct net_device *dev)
{
+#ifdef CONFIG_PCPU_DEV_REFCNT
int i, refcnt = 0;
for_each_possible_cpu(i)
refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
return refcnt;
+#else
+ return refcount_read(&dev->dev_refcnt);
+#endif
}
EXPORT_SYMBOL(netdev_refcnt_read);
+int netdev_unregister_timeout_secs __read_mostly = 10;
+
+#define WAIT_REFS_MIN_MSECS 1
+#define WAIT_REFS_MAX_MSECS 250
/**
- * netdev_wait_allrefs - wait until all references are gone.
- * @dev: target net_device
+ * netdev_wait_allrefs_any - wait until all references are gone.
+ * @list: list of net_devices to wait on
*
* This is called when unregistering network devices.
*
@@ -9519,50 +10216,68 @@ EXPORT_SYMBOL(netdev_refcnt_read);
* We can get stuck here if buggy protocols don't correctly
* call dev_put.
*/
-static void netdev_wait_allrefs(struct net_device *dev)
+static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
{
unsigned long rebroadcast_time, warning_time;
- int refcnt;
-
- linkwatch_forget_dev(dev);
+ struct net_device *dev;
+ int wait = 0;
rebroadcast_time = warning_time = jiffies;
- refcnt = netdev_refcnt_read(dev);
- while (refcnt != 0) {
+ list_for_each_entry(dev, list, todo_list)
+ if (netdev_refcnt_read(dev) == 1)
+ return dev;
+
+ while (true) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
/* Rebroadcast unregister notification */
- call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ list_for_each_entry(dev, list, todo_list)
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
__rtnl_unlock();
rcu_barrier();
rtnl_lock();
- if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
- &dev->state)) {
- /* We must not have linkwatch events
- * pending on unregister. If this
- * happens, we simply run the queue
- * unscheduled, resulting in a noop
- * for this device.
- */
- linkwatch_run_queue();
- }
+ list_for_each_entry(dev, list, todo_list)
+ if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
+ &dev->state)) {
+ /* We must not have linkwatch events
+ * pending on unregister. If this
+ * happens, we simply run the queue
+ * unscheduled, resulting in a noop
+ * for this device.
+ */
+ linkwatch_run_queue();
+ break;
+ }
__rtnl_unlock();
rebroadcast_time = jiffies;
}
- msleep(250);
+ if (!wait) {
+ rcu_barrier();
+ wait = WAIT_REFS_MIN_MSECS;
+ } else {
+ msleep(wait);
+ wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
+ }
+
+ list_for_each_entry(dev, list, todo_list)
+ if (netdev_refcnt_read(dev) == 1)
+ return dev;
- refcnt = netdev_refcnt_read(dev);
+ if (time_after(jiffies, warning_time +
+ READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
+ list_for_each_entry(dev, list, todo_list) {
+ pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
+ dev->name, netdev_refcnt_read(dev));
+ ref_tracker_dir_print(&dev->refcnt_tracker, 10);
+ }
- if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
- pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
- dev->name, refcnt);
warning_time = jiffies;
}
}
@@ -9594,53 +10309,62 @@ static void netdev_wait_allrefs(struct net_device *dev)
*/
void netdev_run_todo(void)
{
+ struct net_device *dev, *tmp;
struct list_head list;
+#ifdef CONFIG_LOCKDEP
+ struct list_head unlink_list;
+
+ list_replace_init(&net_unlink_list, &unlink_list);
+
+ while (!list_empty(&unlink_list)) {
+ struct net_device *dev = list_first_entry(&unlink_list,
+ struct net_device,
+ unlink_list);
+ list_del_init(&dev->unlink_list);
+ dev->nested_level = dev->lower_level - 1;
+ }
+#endif
/* Snapshot list, allow later requests */
list_replace_init(&net_todo_list, &list);
__rtnl_unlock();
-
/* Wait for rcu callbacks to finish before next phase */
if (!list_empty(&list))
rcu_barrier();
- while (!list_empty(&list)) {
- struct net_device *dev
- = list_first_entry(&list, struct net_device, todo_list);
- list_del(&dev->todo_list);
-
+ list_for_each_entry_safe(dev, tmp, &list, todo_list) {
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
- pr_err("network todo '%s' but state %d\n",
- dev->name, dev->reg_state);
- dump_stack();
+ netdev_WARN(dev, "run_todo but not unregistering\n");
+ list_del(&dev->todo_list);
continue;
}
+ write_lock(&dev_base_lock);
dev->reg_state = NETREG_UNREGISTERED;
+ write_unlock(&dev_base_lock);
+ linkwatch_forget_dev(dev);
+ }
- netdev_wait_allrefs(dev);
+ while (!list_empty(&list)) {
+ dev = netdev_wait_allrefs_any(&list);
+ list_del(&dev->todo_list);
/* paranoia */
- BUG_ON(netdev_refcnt_read(dev));
+ BUG_ON(netdev_refcnt_read(dev) != 1);
BUG_ON(!list_empty(&dev->ptype_all));
BUG_ON(!list_empty(&dev->ptype_specific));
WARN_ON(rcu_access_pointer(dev->ip_ptr));
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
-#if IS_ENABLED(CONFIG_DECNET)
- WARN_ON(dev->dn_ptr);
-#endif
+
if (dev->priv_destructor)
dev->priv_destructor(dev);
if (dev->needs_free_netdev)
free_netdev(dev);
- /* Report a network device has been unregistered */
- rtnl_lock();
- dev_net(dev)->dev_unreg_count--;
- __rtnl_unlock();
- wake_up(&netdev_unregistering_wq);
+ if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
+ wake_up(&netdev_unregistering_wq);
/* Free network device */
kobject_put(&dev->dev.kobj);
@@ -9676,6 +10400,21 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
+struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
+{
+ struct net_device_core_stats __percpu *p;
+
+ p = alloc_percpu_gfp(struct net_device_core_stats,
+ GFP_ATOMIC | __GFP_NOWARN);
+
+ if (p && cmpxchg(&dev->core_stats, NULL, p))
+ free_percpu(p);
+
+ /* This READ_ONCE() pairs with the cmpxchg() above */
+ return READ_ONCE(dev->core_stats);
+}
+EXPORT_SYMBOL(netdev_core_stats_alloc);
+
/**
* dev_get_stats - get network device statistics
* @dev: device to get statistics from
@@ -9690,6 +10429,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
struct rtnl_link_stats64 *storage)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ const struct net_device_core_stats __percpu *p;
if (ops->ndo_get_stats64) {
memset(storage, 0, sizeof(*storage));
@@ -9699,13 +10439,74 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
} else {
netdev_stats_to_stats64(storage, &dev->stats);
}
- storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
- storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
- storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
+
+ /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
+ p = READ_ONCE(dev->core_stats);
+ if (p) {
+ const struct net_device_core_stats *core_stats;
+ int i;
+
+ for_each_possible_cpu(i) {
+ core_stats = per_cpu_ptr(p, i);
+ storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
+ storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
+ storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
+ storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
+ }
+ }
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
+/**
+ * dev_fetch_sw_netstats - get per-cpu network device statistics
+ * @s: place to store stats
+ * @netstats: per-cpu network stats to read from
+ *
+ * Read per-cpu network statistics and populate the related fields in @s.
+ */
+void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
+ const struct pcpu_sw_netstats __percpu *netstats)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ const struct pcpu_sw_netstats *stats;
+ unsigned int start;
+
+ stats = per_cpu_ptr(netstats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ rx_packets = u64_stats_read(&stats->rx_packets);
+ rx_bytes = u64_stats_read(&stats->rx_bytes);
+ tx_packets = u64_stats_read(&stats->tx_packets);
+ tx_bytes = u64_stats_read(&stats->tx_bytes);
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+ s->rx_packets += rx_packets;
+ s->rx_bytes += rx_bytes;
+ s->tx_packets += tx_packets;
+ s->tx_bytes += tx_bytes;
+ }
+}
+EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
+
+/**
+ * dev_get_tstats64 - ndo_get_stats64 implementation
+ * @dev: device to get statistics from
+ * @s: place to store stats
+ *
+ * Populate @s from dev->stats and dev->tstats. Can be used as
+ * ndo_get_stats64() callback.
+ */
+void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
+{
+ netdev_stats_to_stats64(s, &dev->stats);
+ dev_fetch_sw_netstats(s, dev->tstats);
+}
+EXPORT_SYMBOL_GPL(dev_get_tstats64);
+
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
struct netdev_queue *queue = dev_ingress_queue(dev);
@@ -9784,16 +10585,22 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1;
- p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!p)
return NULL;
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
+ ref_tracker_dir_init(&dev->refcnt_tracker, 128);
+#ifdef CONFIG_PCPU_DEV_REFCNT
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
goto free_dev;
+ __dev_hold(dev);
+#else
+ refcount_set(&dev->dev_refcnt, 1);
+#endif
if (dev_addr_init(dev))
goto free_pcpu;
@@ -9803,12 +10610,17 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev_net_set(dev, &init_net);
- netdev_register_lockdep_key(dev);
-
- dev->gso_max_size = GSO_MAX_SIZE;
+ dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
+ dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
+ dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
+ dev->tso_max_segs = TSO_MAX_SEGS;
dev->upper_level = 1;
dev->lower_level = 1;
+#ifdef CONFIG_LOCKDEP
+ dev->nested_level = 0;
+ INIT_LIST_HEAD(&dev->unlink_list);
+#endif
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
@@ -9846,7 +10658,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops;
- nf_hook_ingress_init(dev);
+ nf_hook_netdev_init(dev);
return dev;
@@ -9855,8 +10667,10 @@ free_all:
return NULL;
free_pcpu:
+#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
free_dev:
+#endif
netdev_freemem(dev);
return NULL;
}
@@ -9876,6 +10690,17 @@ void free_netdev(struct net_device *dev)
struct napi_struct *p, *n;
might_sleep();
+
+ /* When called immediately after register_netdevice() failed the unwind
+ * handling may still be dismantling the device. Handle that case by
+ * deferring the free.
+ */
+ if (dev->reg_state == NETREG_UNREGISTERING) {
+ ASSERT_RTNL();
+ dev->needs_free_netdev = true;
+ return;
+ }
+
netif_free_tx_queues(dev);
netif_free_rx_queues(dev);
@@ -9887,13 +10712,16 @@ void free_netdev(struct net_device *dev)
list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
netif_napi_del(p);
+ ref_tracker_dir_exit(&dev->refcnt_tracker);
+#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;
+#endif
+ free_percpu(dev->core_stats);
+ dev->core_stats = NULL;
free_percpu(dev->xdp_bulkq);
dev->xdp_bulkq = NULL;
- netdev_unregister_lockdep_key(dev);
-
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
netdev_freemem(dev);
@@ -9944,9 +10772,10 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
if (head) {
list_move_tail(&dev->unreg_list, head);
} else {
- rollback_registered(dev);
- /* Finish processing unregister after unlock */
- net_set_todo(dev);
+ LIST_HEAD(single);
+
+ list_add(&dev->unreg_list, &single);
+ unregister_netdevice_many(&single);
}
}
EXPORT_SYMBOL(unregister_netdevice_queue);
@@ -9960,14 +10789,103 @@ EXPORT_SYMBOL(unregister_netdevice_queue);
*/
void unregister_netdevice_many(struct list_head *head)
{
- struct net_device *dev;
+ struct net_device *dev, *tmp;
+ LIST_HEAD(close_head);
- if (!list_empty(head)) {
- rollback_registered_many(head);
- list_for_each_entry(dev, head, unreg_list)
- net_set_todo(dev);
- list_del(head);
+ BUG_ON(dev_boot_phase);
+ ASSERT_RTNL();
+
+ if (list_empty(head))
+ return;
+
+ list_for_each_entry_safe(dev, tmp, head, unreg_list) {
+ /* Some devices call without registering
+ * for initialization unwind. Remove those
+ * devices and proceed with the remaining.
+ */
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
+ pr_debug("unregister_netdevice: device %s/%p never was registered\n",
+ dev->name, dev);
+
+ WARN_ON(1);
+ list_del(&dev->unreg_list);
+ continue;
+ }
+ dev->dismantle = true;
+ BUG_ON(dev->reg_state != NETREG_REGISTERED);
+ }
+
+ /* If device is running, close it first. */
+ list_for_each_entry(dev, head, unreg_list)
+ list_add_tail(&dev->close_list, &close_head);
+ dev_close_many(&close_head, true);
+
+ list_for_each_entry(dev, head, unreg_list) {
+ /* And unlink it from device chain. */
+ write_lock(&dev_base_lock);
+ unlist_netdevice(dev, false);
+ dev->reg_state = NETREG_UNREGISTERING;
+ write_unlock(&dev_base_lock);
+ }
+ flush_all_backlogs();
+
+ synchronize_net();
+
+ list_for_each_entry(dev, head, unreg_list) {
+ struct sk_buff *skb = NULL;
+
+ /* Shutdown queueing discipline. */
+ dev_shutdown(dev);
+
+ dev_xdp_uninstall(dev);
+
+ netdev_offload_xstats_disable_all(dev);
+
+ /* Notify protocols, that we are about to destroy
+ * this device. They should clean all the things.
+ */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+ if (!dev->rtnl_link_ops ||
+ dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
+ GFP_KERNEL, NULL, 0);
+
+ /*
+ * Flush the unicast and multicast chains
+ */
+ dev_uc_flush(dev);
+ dev_mc_flush(dev);
+
+ netdev_name_node_alt_flush(dev);
+ netdev_name_node_free(dev->name_node);
+
+ if (dev->netdev_ops->ndo_uninit)
+ dev->netdev_ops->ndo_uninit(dev);
+
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
+
+ /* Notifier chain MUST detach us all upper devices. */
+ WARN_ON(netdev_has_any_upper_dev(dev));
+ WARN_ON(netdev_has_any_lower_dev(dev));
+
+ /* Remove entries from kobject tree */
+ netdev_unregister_kobject(dev);
+#ifdef CONFIG_XPS
+ /* Remove XPS queueing entries */
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ }
+
+ synchronize_net();
+
+ list_for_each_entry(dev, head, unreg_list) {
+ netdev_put(dev, &dev->dev_registered_tracker);
+ net_set_todo(dev);
}
+
+ list_del(head);
}
EXPORT_SYMBOL(unregister_netdevice_many);
@@ -9991,11 +10909,13 @@ void unregister_netdev(struct net_device *dev)
EXPORT_SYMBOL(unregister_netdev);
/**
- * dev_change_net_namespace - move device to different nethost namespace
+ * __dev_change_net_namespace - move device to different nethost namespace
* @dev: device
* @net: network namespace
* @pat: If not NULL name pattern to try if the current device name
* is already taken in the destination network namespace.
+ * @new_ifindex: If not zero, specifies device index in the target
+ * namespace.
*
* This function shuts down a device interface and moves it
* to a new network namespace. On success 0 is returned, on
@@ -10004,9 +10924,11 @@ EXPORT_SYMBOL(unregister_netdev);
* Callers must hold the rtnl semaphore.
*/
-int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
+int __dev_change_net_namespace(struct net_device *dev, struct net *net,
+ const char *pat, int new_ifindex)
{
- int err, new_nsid, new_ifindex;
+ struct net *net_old = dev_net(dev);
+ int err, new_nsid;
ASSERT_RTNL();
@@ -10021,14 +10943,14 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
/* Get out if there is nothing todo */
err = 0;
- if (net_eq(dev_net(dev), net))
+ if (net_eq(net_old, net))
goto out;
/* Pick the destination device name, and ensure
* we can use it in the destination network namespace.
*/
err = -EEXIST;
- if (__dev_get_by_name(net, dev->name)) {
+ if (netdev_name_in_use(net, dev->name)) {
/* We get here if we can't use the current device name */
if (!pat)
goto out;
@@ -10037,6 +10959,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
goto out;
}
+ /* Check that new_ifindex isn't used yet. */
+ err = -EBUSY;
+ if (new_ifindex && __dev_get_by_index(net, new_ifindex))
+ goto out;
+
/*
* And now a mini version of register_netdevice unregister_netdevice.
*/
@@ -10045,7 +10972,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
dev_close(dev);
/* And unlink it from device chain */
- unlist_netdevice(dev);
+ unlist_netdevice(dev, true);
synchronize_net();
@@ -10064,10 +10991,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
/* If there is an ifindex conflict assign a new one */
- if (__dev_get_by_index(net, dev->ifindex))
- new_ifindex = dev_new_index(net);
- else
- new_ifindex = dev->ifindex;
+ if (!new_ifindex) {
+ if (__dev_get_by_index(net, dev->ifindex))
+ new_ifindex = dev_new_index(net);
+ else
+ new_ifindex = dev->ifindex;
+ }
rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
new_ifindex);
@@ -10097,6 +11026,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
err = device_rename(&dev->dev, dev->name);
WARN_ON(err);
+ /* Adapt owner in case owning user namespace of target network
+ * namespace is different from the original one.
+ */
+ err = netdev_change_owner(dev, net_old, net);
+ WARN_ON(err);
+
/* Add the device back in the hashes */
list_netdevice(dev);
@@ -10114,7 +11049,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
out:
return err;
}
-EXPORT_SYMBOL_GPL(dev_change_net_namespace);
+EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
static int dev_cpu_dead(unsigned int oldcpu)
{
@@ -10171,11 +11106,11 @@ static int dev_cpu_dead(unsigned int oldcpu)
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
- netif_rx_ni(skb);
+ netif_rx(skb);
input_queue_head_incr(oldsd);
}
while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
- netif_rx_ni(skb);
+ netif_rx(skb);
input_queue_head_incr(oldsd);
}
@@ -10229,8 +11164,7 @@ static int __net_init netdev_init(struct net *net)
BUILD_BUG_ON(GRO_HASH_BUCKETS >
8 * sizeof_field(struct napi_struct, gro_bitmask));
- if (net != &init_net)
- INIT_LIST_HEAD(&net->dev_base_head);
+ INIT_LIST_HEAD(&net->dev_base_head);
net->dev_name_head = netdev_create_hash();
if (net->dev_name_head == NULL)
@@ -10346,14 +11280,14 @@ static struct pernet_operations __net_initdata netdev_net_ops = {
.exit = netdev_exit,
};
-static void __net_exit default_device_exit(struct net *net)
+static void __net_exit default_device_exit_net(struct net *net)
{
struct net_device *dev, *aux;
/*
* Push all migratable network devices back to the
* initial network namespace
*/
- rtnl_lock();
+ ASSERT_RTNL();
for_each_netdev_safe(net, dev, aux) {
int err;
char fb_name[IFNAMSIZ];
@@ -10363,12 +11297,12 @@ static void __net_exit default_device_exit(struct net *net)
continue;
/* Leave virtual devices for the generic cleanup */
- if (dev->rtnl_link_ops)
+ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
continue;
/* Push remaining network devices to init_net */
snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
- if (__dev_get_by_name(&init_net, fb_name))
+ if (netdev_name_in_use(&init_net, fb_name))
snprintf(fb_name, IFNAMSIZ, "dev%%d");
err = dev_change_net_namespace(dev, &init_net, fb_name);
if (err) {
@@ -10377,35 +11311,6 @@ static void __net_exit default_device_exit(struct net *net)
BUG();
}
}
- rtnl_unlock();
-}
-
-static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
-{
- /* Return with the rtnl_lock held when there are no network
- * devices unregistering in any network namespace in net_list.
- */
- struct net *net;
- bool unregistering;
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
- add_wait_queue(&netdev_unregistering_wq, &wait);
- for (;;) {
- unregistering = false;
- rtnl_lock();
- list_for_each_entry(net, net_list, exit_list) {
- if (net->dev_unreg_count > 0) {
- unregistering = true;
- break;
- }
- }
- if (!unregistering)
- break;
- __rtnl_unlock();
-
- wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
- }
- remove_wait_queue(&netdev_unregistering_wq, &wait);
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
@@ -10419,18 +11324,12 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
struct net *net;
LIST_HEAD(dev_kill_list);
- /* To prevent network device cleanup code from dereferencing
- * loopback devices or network devices that have been freed
- * wait here for all pending unregistrations to complete,
- * before unregistring the loopback device and allowing the
- * network namespace be freed.
- *
- * The netdev todo list containing all network devices
- * unregistrations that happen in default_device_exit_batch
- * will run in the rtnl_unlock() at the end of
- * default_device_exit_batch.
- */
- rtnl_lock_unregistering(net_list);
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ default_device_exit_net(net);
+ cond_resched();
+ }
+
list_for_each_entry(net, net_list, exit_list) {
for_each_netdev_reverse(net, dev) {
if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
@@ -10444,7 +11343,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
}
static struct pernet_operations __net_initdata default_device_ops = {
- .exit = default_device_exit,
.exit_batch = default_device_exit_batch,
};
@@ -10475,8 +11373,6 @@ static int __init net_dev_init(void)
for (i = 0; i < PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&ptype_base[i]);
- INIT_LIST_HEAD(&offload_base);
-
if (register_pernet_subsys(&netdev_net_ops))
goto out;
@@ -10498,10 +11394,11 @@ static int __init net_dev_init(void)
INIT_LIST_HEAD(&sd->poll_list);
sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
- sd->csd.func = rps_trigger_softirq;
- sd->csd.info = sd;
+ INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
sd->cpu = i;
#endif
+ INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
+ spin_lock_init(&sd->defer_lock);
init_gro_hash(&sd->backlog);
sd->backlog.poll = process_backlog;
diff --git a/net/core/dev.h b/net/core/dev.h
new file mode 100644
index 000000000000..cbb8a925175a
--- /dev/null
+++ b/net/core/dev.h
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_CORE_DEV_H
+#define _NET_CORE_DEV_H
+
+#include <linux/types.h>
+
+struct net;
+struct net_device;
+struct netdev_bpf;
+struct netdev_phys_item_id;
+struct netlink_ext_ack;
+
+/* Random bits of netdevice that don't need to be exposed */
+#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */
+struct sd_flow_limit {
+ u64 count;
+ unsigned int num_buckets;
+ unsigned int history_head;
+ u16 history[FLOW_LIMIT_HISTORY];
+ u8 buckets[];
+};
+
+extern int netdev_flow_limit_table_len;
+
+#ifdef CONFIG_PROC_FS
+int __init dev_proc_init(void);
+#else
+#define dev_proc_init() 0
+#endif
+
+void linkwatch_init_dev(struct net_device *dev);
+void linkwatch_forget_dev(struct net_device *dev);
+void linkwatch_run_queue(void);
+
+void dev_addr_flush(struct net_device *dev);
+int dev_addr_init(struct net_device *dev);
+void dev_addr_check(struct net_device *dev);
+
+/* sysctls not referred to from outside net/core/ */
+extern int netdev_budget;
+extern unsigned int netdev_budget_usecs;
+extern unsigned int sysctl_skb_defer_max;
+extern int netdev_tstamp_prequeue;
+extern int netdev_unregister_timeout_secs;
+extern int weight_p;
+extern int dev_weight_rx_bias;
+extern int dev_weight_tx_bias;
+
+/* rtnl helpers */
+extern struct list_head net_todo_list;
+void netdev_run_todo(void);
+
+/* netdev management, shared between various uAPI entry points */
+struct netdev_name_node {
+ struct hlist_node hlist;
+ struct list_head list;
+ struct net_device *dev;
+ const char *name;
+};
+
+int netdev_get_name(struct net *net, char *name, int ifindex);
+int dev_change_name(struct net_device *dev, const char *newname);
+
+int netdev_name_node_alt_create(struct net_device *dev, const char *name);
+int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);
+
+int dev_validate_mtu(struct net_device *dev, int mtu,
+ struct netlink_ext_ack *extack);
+int dev_set_mtu_ext(struct net_device *dev, int mtu,
+ struct netlink_ext_ack *extack);
+
+int dev_get_phys_port_id(struct net_device *dev,
+ struct netdev_phys_item_id *ppid);
+int dev_get_phys_port_name(struct net_device *dev,
+ char *name, size_t len);
+
+int dev_change_proto_down(struct net_device *dev, bool proto_down);
+void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
+ u32 value);
+
+typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
+int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
+ int fd, int expected_fd, u32 flags);
+
+int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len);
+void dev_set_group(struct net_device *dev, int new_group);
+int dev_change_carrier(struct net_device *dev, bool new_carrier);
+
+void __dev_set_rx_mode(struct net_device *dev);
+
+static inline void netif_set_gso_max_size(struct net_device *dev,
+ unsigned int size)
+{
+ /* dev->gso_max_size is read locklessly from sk_setup_caps() */
+ WRITE_ONCE(dev->gso_max_size, size);
+}
+
+static inline void netif_set_gso_max_segs(struct net_device *dev,
+ unsigned int segs)
+{
+ /* dev->gso_max_segs is read locklessly from sk_setup_caps() */
+ WRITE_ONCE(dev->gso_max_segs, segs);
+}
+
+static inline void netif_set_gro_max_size(struct net_device *dev,
+ unsigned int size)
+{
+ /* This pairs with the READ_ONCE() in skb_gro_receive() */
+ WRITE_ONCE(dev->gro_max_size, size);
+}
+
+#endif
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 2f949b5a1eb9..baa63dee2829 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -12,14 +12,44 @@
#include <linux/export.h>
#include <linux/list.h>
+#include "dev.h"
+
/*
* General list handling functions
*/
-static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
- const unsigned char *addr, int addr_len,
- unsigned char addr_type, bool global,
- bool sync)
+static int __hw_addr_insert(struct netdev_hw_addr_list *list,
+ struct netdev_hw_addr *new, int addr_len)
+{
+ struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL;
+ struct netdev_hw_addr *ha;
+
+ while (*ins_point) {
+ int diff;
+
+ ha = rb_entry(*ins_point, struct netdev_hw_addr, node);
+ diff = memcmp(new->addr, ha->addr, addr_len);
+ if (diff == 0)
+ diff = memcmp(&new->type, &ha->type, sizeof(new->type));
+
+ parent = *ins_point;
+ if (diff < 0)
+ ins_point = &parent->rb_left;
+ else if (diff > 0)
+ ins_point = &parent->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node_rcu(&new->node, parent, ins_point);
+ rb_insert_color(&new->node, &list->tree);
+
+ return 0;
+}
+
+static struct netdev_hw_addr*
+__hw_addr_create(const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync)
{
struct netdev_hw_addr *ha;
int alloc_size;
@@ -29,32 +59,44 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
alloc_size = L1_CACHE_BYTES;
ha = kmalloc(alloc_size, GFP_ATOMIC);
if (!ha)
- return -ENOMEM;
+ return NULL;
memcpy(ha->addr, addr, addr_len);
ha->type = addr_type;
ha->refcount = 1;
ha->global_use = global;
ha->synced = sync ? 1 : 0;
ha->sync_cnt = 0;
- list_add_tail_rcu(&ha->list, &list->list);
- list->count++;
- return 0;
+ return ha;
}
static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global, bool sync,
- int sync_count)
+ int sync_count, bool exclusive)
{
+ struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL;
struct netdev_hw_addr *ha;
if (addr_len > MAX_ADDR_LEN)
return -EINVAL;
- list_for_each_entry(ha, &list->list, list) {
- if (ha->type == addr_type &&
- !memcmp(ha->addr, addr, addr_len)) {
+ while (*ins_point) {
+ int diff;
+
+ ha = rb_entry(*ins_point, struct netdev_hw_addr, node);
+ diff = memcmp(addr, ha->addr, addr_len);
+ if (diff == 0)
+ diff = memcmp(&addr_type, &ha->type, sizeof(addr_type));
+
+ parent = *ins_point;
+ if (diff < 0) {
+ ins_point = &parent->rb_left;
+ } else if (diff > 0) {
+ ins_point = &parent->rb_right;
+ } else {
+ if (exclusive)
+ return -EEXIST;
if (global) {
/* check if addr is already used as global */
if (ha->global_use)
@@ -73,8 +115,17 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
}
}
- return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
- sync);
+ ha = __hw_addr_create(addr, addr_len, addr_type, global, sync);
+ if (!ha)
+ return -ENOMEM;
+
+ rb_link_node(&ha->node, parent, ins_point);
+ rb_insert_color(&ha->node, &list->tree);
+
+ list_add_tail_rcu(&ha->list, &list->list);
+ list->count++;
+
+ return 0;
}
static int __hw_addr_add(struct netdev_hw_addr_list *list,
@@ -82,7 +133,7 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list,
unsigned char addr_type)
{
return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false,
- 0);
+ 0, false);
}
static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
@@ -103,24 +154,50 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
if (--ha->refcount)
return 0;
+
+ rb_erase(&ha->node, &list->tree);
+
list_del_rcu(&ha->list);
kfree_rcu(ha, rcu_head);
list->count--;
return 0;
}
+static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
+{
+ struct rb_node *node;
+
+ node = list->tree.rb_node;
+
+ while (node) {
+ struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node);
+ int diff = memcmp(addr, ha->addr, addr_len);
+
+ if (diff == 0 && addr_type)
+ diff = memcmp(&addr_type, &ha->type, sizeof(addr_type));
+
+ if (diff < 0)
+ node = node->rb_left;
+ else if (diff > 0)
+ node = node->rb_right;
+ else
+ return ha;
+ }
+
+ return NULL;
+}
+
static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global, bool sync)
{
- struct netdev_hw_addr *ha;
+ struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type);
- list_for_each_entry(ha, &list->list, list) {
- if (!memcmp(ha->addr, addr, addr_len) &&
- (ha->type == addr_type || !addr_type))
- return __hw_addr_del_entry(list, ha, global, sync);
- }
- return -ENOENT;
+ if (!ha)
+ return -ENOENT;
+ return __hw_addr_del_entry(list, ha, global, sync);
}
static int __hw_addr_del(struct netdev_hw_addr_list *list,
@@ -137,7 +214,7 @@ static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list,
int err;
err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type,
- false, true, ha->sync_cnt);
+ false, true, ha->sync_cnt, false);
if (err && err != -EEXIST)
return err;
@@ -228,7 +305,7 @@ EXPORT_SYMBOL(__hw_addr_unsync);
* @sync: function to call if address should be added
* @unsync: function to call if address should be removed
*
- * This funciton is intended to be called from the ndo_set_rx_mode
+ * This function is intended to be called from the ndo_set_rx_mode
* function of devices that require explicit address add/remove
* notifications. The unsync function may be NULL in which case
* the addresses requiring removal will simply be removed without
@@ -407,6 +484,7 @@ static void __hw_addr_flush(struct netdev_hw_addr_list *list)
{
struct netdev_hw_addr *ha, *tmp;
+ list->tree = RB_ROOT;
list_for_each_entry_safe(ha, tmp, &list->list, list) {
list_del_rcu(&ha->list);
kfree_rcu(ha, rcu_head);
@@ -418,6 +496,7 @@ void __hw_addr_init(struct netdev_hw_addr_list *list)
{
INIT_LIST_HEAD(&list->list);
list->count = 0;
+ list->tree = RB_ROOT;
}
EXPORT_SYMBOL(__hw_addr_init);
@@ -425,6 +504,21 @@ EXPORT_SYMBOL(__hw_addr_init);
* Device addresses handling functions
*/
+/* Check that netdev->dev_addr is not written to directly as this would
+ * break the rbtree layout. All changes should go thru dev_addr_set() and co.
+ * Remove this check in mid-2024.
+ */
+void dev_addr_check(struct net_device *dev)
+{
+ if (!memcmp(dev->dev_addr, dev->dev_addr_shadow, MAX_ADDR_LEN))
+ return;
+
+ netdev_warn(dev, "Current addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr);
+ netdev_warn(dev, "Expected addr: %*ph\n",
+ MAX_ADDR_LEN, dev->dev_addr_shadow);
+ netdev_WARN(dev, "Incorrect netdev->dev_addr\n");
+}
+
/**
* dev_addr_flush - Flush device address list
* @dev: device
@@ -436,11 +530,11 @@ EXPORT_SYMBOL(__hw_addr_init);
void dev_addr_flush(struct net_device *dev)
{
/* rtnl_mutex must be held here */
+ dev_addr_check(dev);
__hw_addr_flush(&dev->dev_addrs);
dev->dev_addr = NULL;
}
-EXPORT_SYMBOL(dev_addr_flush);
/**
* dev_addr_init - Init device address list
@@ -474,7 +568,21 @@ int dev_addr_init(struct net_device *dev)
}
return err;
}
-EXPORT_SYMBOL(dev_addr_init);
+
+void dev_addr_mod(struct net_device *dev, unsigned int offset,
+ const void *addr, size_t len)
+{
+ struct netdev_hw_addr *ha;
+
+ dev_addr_check(dev);
+
+ ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]);
+ rb_erase(&ha->node, &dev->dev_addrs.tree);
+ memcpy(&ha->addr[offset], addr, len);
+ memcpy(&dev->dev_addr_shadow[offset], addr, len);
+ WARN_ON(__hw_addr_insert(&dev->dev_addrs, ha, dev->addr_len));
+}
+EXPORT_SYMBOL(dev_addr_mod);
/**
* dev_addr_add - Add a device address
@@ -552,22 +660,14 @@ EXPORT_SYMBOL(dev_addr_del);
*/
int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
{
- struct netdev_hw_addr *ha;
int err;
netif_addr_lock_bh(dev);
- list_for_each_entry(ha, &dev->uc.list, list) {
- if (!memcmp(ha->addr, addr, dev->addr_len) &&
- ha->type == NETDEV_HW_ADDR_T_UNICAST) {
- err = -EEXIST;
- goto out;
- }
- }
- err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
- NETDEV_HW_ADDR_T_UNICAST, true, false);
+ err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST, true, false,
+ 0, true);
if (!err)
__dev_set_rx_mode(dev);
-out:
netif_addr_unlock_bh(dev);
return err;
}
@@ -690,6 +790,15 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return;
+ /* netif_addr_lock_bh() uses lockdep subclass 0, this is okay for two
+ * reasons:
+ * 1) This is always called without any addr_list_lock, so as the
+ * outermost one here, it must be 0.
+ * 2) This is called by some callers after unlinking the upper device,
+ * so the dev->lower_level becomes 1 again.
+ * Therefore, the subclass for 'from' is 0, for 'to' is either 1 or
+ * larger.
+ */
netif_addr_lock_bh(from);
netif_addr_lock(to);
__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
@@ -714,7 +823,7 @@ void dev_uc_flush(struct net_device *dev)
EXPORT_SYMBOL(dev_uc_flush);
/**
- * dev_uc_flush - Init unicast address list
+ * dev_uc_init - Init unicast address list
* @dev: device
*
* Init unicast address list.
@@ -736,22 +845,14 @@ EXPORT_SYMBOL(dev_uc_init);
*/
int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
{
- struct netdev_hw_addr *ha;
int err;
netif_addr_lock_bh(dev);
- list_for_each_entry(ha, &dev->mc.list, list) {
- if (!memcmp(ha->addr, addr, dev->addr_len) &&
- ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
- err = -EEXIST;
- goto out;
- }
- }
- err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
- NETDEV_HW_ADDR_T_MULTICAST, true, false);
+ err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, true, false,
+ 0, true);
if (!err)
__dev_set_rx_mode(dev);
-out:
netif_addr_unlock_bh(dev);
return err;
}
@@ -764,7 +865,8 @@ static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
netif_addr_lock_bh(dev);
err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
- NETDEV_HW_ADDR_T_MULTICAST, global, false, 0);
+ NETDEV_HW_ADDR_T_MULTICAST, global, false,
+ 0, false);
if (!err)
__dev_set_rx_mode(dev);
netif_addr_unlock_bh(dev);
@@ -911,6 +1013,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return;
+ /* See the above comments inside dev_uc_unsync(). */
netif_addr_lock_bh(from);
netif_addr_lock(to);
__hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c
new file mode 100644
index 000000000000..049cfbc58aa9
--- /dev/null
+++ b/net/core/dev_addr_lists_test.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <kunit/test.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+
+static const struct net_device_ops dummy_netdev_ops = {
+};
+
+struct dev_addr_test_priv {
+ u32 addr_seen;
+};
+
+static int dev_addr_test_sync(struct net_device *netdev, const unsigned char *a)
+{
+ struct dev_addr_test_priv *datp = netdev_priv(netdev);
+
+ if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+ datp->addr_seen |= 1 << a[0];
+ return 0;
+}
+
+static int dev_addr_test_unsync(struct net_device *netdev,
+ const unsigned char *a)
+{
+ struct dev_addr_test_priv *datp = netdev_priv(netdev);
+
+ if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+ datp->addr_seen &= ~(1 << a[0]);
+ return 0;
+}
+
+static int dev_addr_test_init(struct kunit *test)
+{
+ struct dev_addr_test_priv *datp;
+ struct net_device *netdev;
+ int err;
+
+ netdev = alloc_etherdev(sizeof(*datp));
+ KUNIT_ASSERT_TRUE(test, !!netdev);
+
+ test->priv = netdev;
+ netdev->netdev_ops = &dummy_netdev_ops;
+
+ err = register_netdev(netdev);
+ if (err) {
+ free_netdev(netdev);
+ KUNIT_FAIL(test, "Can't register netdev %d", err);
+ }
+
+ rtnl_lock();
+ return 0;
+}
+
+static void dev_addr_test_exit(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+
+ rtnl_unlock();
+ unregister_netdev(netdev);
+ free_netdev(netdev);
+}
+
+static void dev_addr_test_basic(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ u8 addr[ETH_ALEN];
+
+ KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr);
+
+ memset(addr, 2, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+ KUNIT_EXPECT_EQ(test, 0, memcmp(netdev->dev_addr, addr, sizeof(addr)));
+
+ memset(addr, 3, sizeof(addr));
+ dev_addr_set(netdev, addr);
+ KUNIT_EXPECT_EQ(test, 0, memcmp(netdev->dev_addr, addr, sizeof(addr)));
+}
+
+static void dev_addr_test_sync_one(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ u8 addr[ETH_ALEN];
+
+ datp = netdev_priv(netdev);
+
+ memset(addr, 1, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 2, datp->addr_seen);
+
+ memset(addr, 2, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ datp->addr_seen = 0;
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ /* It's not going to sync anything because the main address is
+ * considered synced and we overwrite in place.
+ */
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_seen);
+}
+
+static void dev_addr_test_add_del(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ u8 addr[ETH_ALEN];
+ int i;
+
+ datp = netdev_priv(netdev);
+
+ for (i = 1; i < 4; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ }
+ /* Add 3 again */
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen);
+
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen);
+
+ for (i = 1; i < 4; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ }
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1, datp->addr_seen);
+}
+
+static void dev_addr_test_del_main(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ u8 addr[ETH_ALEN];
+
+ memset(addr, 1, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+}
+
+static void dev_addr_test_add_set(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ u8 addr[ETH_ALEN];
+ int i;
+
+ datp = netdev_priv(netdev);
+
+ /* There is no external API like dev_addr_add_excl(),
+ * so shuffle the tree a little bit and exploit aliasing.
+ */
+ for (i = 1; i < 16; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ }
+
+ memset(addr, i, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ memset(addr, 0, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen);
+}
+
+static void dev_addr_test_add_excl(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ u8 addr[ETH_ALEN];
+ int i;
+
+ for (i = 0; i < 10; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr));
+ }
+ KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr));
+
+ for (i = 0; i < 10; i += 2) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr));
+ }
+ for (i = 1; i < 10; i += 2) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr));
+ }
+}
+
+static struct kunit_case dev_addr_test_cases[] = {
+ KUNIT_CASE(dev_addr_test_basic),
+ KUNIT_CASE(dev_addr_test_sync_one),
+ KUNIT_CASE(dev_addr_test_add_del),
+ KUNIT_CASE(dev_addr_test_del_main),
+ KUNIT_CASE(dev_addr_test_add_set),
+ KUNIT_CASE(dev_addr_test_add_excl),
+ {}
+};
+
+static struct kunit_suite dev_addr_test_suite = {
+ .name = "dev-addr-list-test",
+ .test_cases = dev_addr_test_cases,
+ .init = dev_addr_test_init,
+ .exit = dev_addr_test_exit,
+};
+kunit_test_suite(dev_addr_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index dbaebbe573f0..7674bb9f3076 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -1,12 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kmod.h>
#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/net_tstamp.h>
#include <linux/wireless.h>
+#include <linux/if_bridge.h>
+#include <net/dsa.h>
#include <net/wext.h>
+#include "dev.h"
+
/*
* Map an interface index to its name (SIOCGIFNAME)
*/
@@ -24,79 +29,108 @@ static int dev_ifname(struct net *net, struct ifreq *ifr)
return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex);
}
-static gifconf_func_t *gifconf_list[NPROTO];
-
-/**
- * register_gifconf - register a SIOCGIF handler
- * @family: Address family
- * @gifconf: Function handler
- *
- * Register protocol dependent address dumping routines. The handler
- * that is passed must not be freed or reused until it has been replaced
- * by another handler.
- */
-int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
-{
- if (family >= NPROTO)
- return -EINVAL;
- gifconf_list[family] = gifconf;
- return 0;
-}
-EXPORT_SYMBOL(register_gifconf);
-
/*
* Perform a SIOCGIFCONF call. This structure will change
* size eventually, and there is nothing I can do about it.
* Thus we will need a 'compatibility mode'.
*/
-
-int dev_ifconf(struct net *net, struct ifconf *ifc, int size)
+int dev_ifconf(struct net *net, struct ifconf __user *uifc)
{
struct net_device *dev;
- char __user *pos;
- int len;
- int total;
- int i;
+ void __user *pos;
+ size_t size;
+ int len, total = 0, done;
- /*
- * Fetch the caller's info block.
- */
+ /* both the ifconf and the ifreq structures are slightly different */
+ if (in_compat_syscall()) {
+ struct compat_ifconf ifc32;
- pos = ifc->ifc_buf;
- len = ifc->ifc_len;
+ if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf)))
+ return -EFAULT;
- /*
- * Loop over the interfaces, and write an info block for each.
- */
+ pos = compat_ptr(ifc32.ifcbuf);
+ len = ifc32.ifc_len;
+ size = sizeof(struct compat_ifreq);
+ } else {
+ struct ifconf ifc;
+
+ if (copy_from_user(&ifc, uifc, sizeof(struct ifconf)))
+ return -EFAULT;
+
+ pos = ifc.ifc_buf;
+ len = ifc.ifc_len;
+ size = sizeof(struct ifreq);
+ }
- total = 0;
+ /* Loop over the interfaces, and write an info block for each. */
+ rtnl_lock();
for_each_netdev(net, dev) {
- for (i = 0; i < NPROTO; i++) {
- if (gifconf_list[i]) {
- int done;
- if (!pos)
- done = gifconf_list[i](dev, NULL, 0, size);
- else
- done = gifconf_list[i](dev, pos + total,
- len - total, size);
- if (done < 0)
- return -EFAULT;
- total += done;
- }
+ if (!pos)
+ done = inet_gifconf(dev, NULL, 0, size);
+ else
+ done = inet_gifconf(dev, pos + total,
+ len - total, size);
+ if (done < 0) {
+ rtnl_unlock();
+ return -EFAULT;
}
+ total += done;
}
+ rtnl_unlock();
- /*
- * All done. Write the updated control block back to the caller.
- */
- ifc->ifc_len = total;
+ return put_user(total, &uifc->ifc_len);
+}
+
+static int dev_getifmap(struct net_device *dev, struct ifreq *ifr)
+{
+ struct ifmap *ifmap = &ifr->ifr_map;
+
+ if (in_compat_syscall()) {
+ struct compat_ifmap *cifmap = (struct compat_ifmap *)ifmap;
+
+ cifmap->mem_start = dev->mem_start;
+ cifmap->mem_end = dev->mem_end;
+ cifmap->base_addr = dev->base_addr;
+ cifmap->irq = dev->irq;
+ cifmap->dma = dev->dma;
+ cifmap->port = dev->if_port;
+
+ return 0;
+ }
+
+ ifmap->mem_start = dev->mem_start;
+ ifmap->mem_end = dev->mem_end;
+ ifmap->base_addr = dev->base_addr;
+ ifmap->irq = dev->irq;
+ ifmap->dma = dev->dma;
+ ifmap->port = dev->if_port;
- /*
- * Both BSD and Solaris return 0 here, so we do too.
- */
return 0;
}
+static int dev_setifmap(struct net_device *dev, struct ifreq *ifr)
+{
+ struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map;
+
+ if (!dev->netdev_ops->ndo_set_config)
+ return -EOPNOTSUPP;
+
+ if (in_compat_syscall()) {
+ struct ifmap ifmap = {
+ .mem_start = cifmap->mem_start,
+ .mem_end = cifmap->mem_end,
+ .base_addr = cifmap->base_addr,
+ .irq = cifmap->irq,
+ .dma = cifmap->dma,
+ .port = cifmap->port,
+ };
+
+ return dev->netdev_ops->ndo_set_config(dev, &ifmap);
+ }
+
+ return dev->netdev_ops->ndo_set_config(dev, &ifr->ifr_map);
+}
+
/*
* Perform the SIOCxIFxxx calls, inside rcu_read_lock()
*/
@@ -122,29 +156,12 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
ifr->ifr_mtu = dev->mtu;
return 0;
- case SIOCGIFHWADDR:
- if (!dev->addr_len)
- memset(ifr->ifr_hwaddr.sa_data, 0,
- sizeof(ifr->ifr_hwaddr.sa_data));
- else
- memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
- min(sizeof(ifr->ifr_hwaddr.sa_data),
- (size_t)dev->addr_len));
- ifr->ifr_hwaddr.sa_family = dev->type;
- return 0;
-
case SIOCGIFSLAVE:
err = -EINVAL;
break;
case SIOCGIFMAP:
- ifr->ifr_map.mem_start = dev->mem_start;
- ifr->ifr_map.mem_end = dev->mem_end;
- ifr->ifr_map.base_addr = dev->base_addr;
- ifr->ifr_map.irq = dev->irq;
- ifr->ifr_map.dma = dev->dma;
- ifr->ifr_map.port = dev->if_port;
- return 0;
+ return dev_getifmap(dev, ifr);
case SIOCGIFINDEX:
ifr->ifr_ifindex = dev->ifindex;
@@ -177,7 +194,7 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
return -EFAULT;
- if (cfg.flags) /* reserved for future extensions */
+ if (cfg.flags & ~HWTSTAMP_FLAG_MASK)
return -EINVAL;
tx_type = cfg.tx_type;
@@ -190,6 +207,9 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
case HWTSTAMP_TX_ONESTEP_P2P:
tx_type_valid = 1;
break;
+ case __HWTSTAMP_TX_CNT:
+ /* not a real value */
+ break;
}
switch (rx_filter) {
@@ -211,6 +231,9 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
case HWTSTAMP_FILTER_NTP_ALL:
rx_filter_valid = 1;
break;
+ case __HWTSTAMP_FILTER_CNT:
+ /* not a real value */
+ break;
}
if (!tx_type_valid || !rx_filter_valid)
@@ -219,14 +242,80 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
return 0;
}
+static int dev_eth_ioctl(struct net_device *dev,
+ struct ifreq *ifr, unsigned int cmd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
+
+ err = dsa_ndo_eth_ioctl(dev, ifr, cmd);
+ if (err == 0 || err != -EOPNOTSUPP)
+ return err;
+
+ if (ops->ndo_eth_ioctl) {
+ if (netif_device_present(dev))
+ err = ops->ndo_eth_ioctl(dev, ifr, cmd);
+ else
+ err = -ENODEV;
+ }
+
+ return err;
+}
+
+static int dev_siocbond(struct net_device *dev,
+ struct ifreq *ifr, unsigned int cmd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_siocbond) {
+ if (netif_device_present(dev))
+ return ops->ndo_siocbond(dev, ifr, cmd);
+ else
+ return -ENODEV;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ void __user *data, unsigned int cmd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_siocdevprivate) {
+ if (netif_device_present(dev))
+ return ops->ndo_siocdevprivate(dev, ifr, data, cmd);
+ else
+ return -ENODEV;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_siocwandev) {
+ if (netif_device_present(dev))
+ return ops->ndo_siocwandev(dev, ifs);
+ else
+ return -ENODEV;
+ }
+
+ return -EOPNOTSUPP;
+}
+
/*
* Perform the SIOCxIFxxx calls, inside rtnl_lock()
*/
-static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
+static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
+ unsigned int cmd)
{
int err;
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
const struct net_device_ops *ops;
+ netdevice_tracker dev_tracker;
if (!dev)
return -ENODEV;
@@ -247,7 +336,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
case SIOCSIFHWADDR:
if (dev->addr_len > sizeof(struct sockaddr))
return -EINVAL;
- return dev_set_mac_address(dev, &ifr->ifr_hwaddr, NULL);
+ return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL);
case SIOCSIFHWBROADCAST:
if (ifr->ifr_hwaddr.sa_family != dev->type)
@@ -259,12 +348,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
return 0;
case SIOCSIFMAP:
- if (ops->ndo_set_config) {
- if (!netif_device_present(dev))
- return -ENODEV;
- return ops->ndo_set_config(dev, &ifr->ifr_map);
- }
- return -EOPNOTSUPP;
+ return dev_setifmap(dev, ifr);
case SIOCADDMULTI:
if (!ops->ndo_set_rx_mode ||
@@ -291,39 +375,49 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
ifr->ifr_newname[IFNAMSIZ-1] = '\0';
return dev_change_name(dev, ifr->ifr_newname);
+ case SIOCWANDEV:
+ return dev_siocwandev(dev, &ifr->ifr_settings);
+
+ case SIOCBRADDIF:
+ case SIOCBRDELIF:
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ if (!netif_is_bridge_master(dev))
+ return -EOPNOTSUPP;
+ netdev_hold(dev, &dev_tracker, GFP_KERNEL);
+ rtnl_unlock();
+ err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL);
+ netdev_put(dev, &dev_tracker);
+ rtnl_lock();
+ return err;
+
case SIOCSHWTSTAMP:
err = net_hwtstamp_validate(ifr);
if (err)
return err;
- /* fall through */
+ fallthrough;
/*
* Unknown or private ioctl
*/
default:
- if ((cmd >= SIOCDEVPRIVATE &&
- cmd <= SIOCDEVPRIVATE + 15) ||
- cmd == SIOCBONDENSLAVE ||
+ if (cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15)
+ return dev_siocdevprivate(dev, ifr, data, cmd);
+
+ if (cmd == SIOCGMIIPHY ||
+ cmd == SIOCGMIIREG ||
+ cmd == SIOCSMIIREG ||
+ cmd == SIOCSHWTSTAMP ||
+ cmd == SIOCGHWTSTAMP) {
+ err = dev_eth_ioctl(dev, ifr, cmd);
+ } else if (cmd == SIOCBONDENSLAVE ||
cmd == SIOCBONDRELEASE ||
cmd == SIOCBONDSETHWADDR ||
cmd == SIOCBONDSLAVEINFOQUERY ||
cmd == SIOCBONDINFOQUERY ||
- cmd == SIOCBONDCHANGEACTIVE ||
- cmd == SIOCGMIIPHY ||
- cmd == SIOCGMIIREG ||
- cmd == SIOCSMIIREG ||
- cmd == SIOCBRADDIF ||
- cmd == SIOCBRDELIF ||
- cmd == SIOCSHWTSTAMP ||
- cmd == SIOCGHWTSTAMP ||
- cmd == SIOCWANDEV) {
- err = -EOPNOTSUPP;
- if (ops->ndo_do_ioctl) {
- if (netif_device_present(dev))
- err = ops->ndo_do_ioctl(dev, ifr, cmd);
- else
- err = -ENODEV;
- }
+ cmd == SIOCBONDCHANGEACTIVE) {
+ err = dev_siocbond(dev, ifr, cmd);
} else
err = -EINVAL;
@@ -376,7 +470,8 @@ EXPORT_SYMBOL(dev_load);
* positive or a negative errno code on error.
*/
-int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout)
+int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
+ void __user *data, bool *need_copyout)
{
int ret;
char *colon;
@@ -397,6 +492,12 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
*/
switch (cmd) {
+ case SIOCGIFHWADDR:
+ dev_load(net, ifr->ifr_name);
+ ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name);
+ if (colon)
+ *colon = ':';
+ return ret;
/*
* These ioctl calls:
* - can be done by all.
@@ -406,7 +507,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCGIFFLAGS:
case SIOCGIFMETRIC:
case SIOCGIFMTU:
- case SIOCGIFHWADDR:
case SIOCGIFSLAVE:
case SIOCGIFMAP:
case SIOCGIFINDEX:
@@ -421,9 +521,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCETHTOOL:
dev_load(net, ifr->ifr_name);
- rtnl_lock();
- ret = dev_ethtool(net, ifr);
- rtnl_unlock();
+ ret = dev_ethtool(net, ifr, data);
if (colon)
*colon = ':';
return ret;
@@ -441,7 +539,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
rtnl_lock();
- ret = dev_ifsioc(net, ifr, cmd);
+ ret = dev_ifsioc(net, ifr, data, cmd);
rtnl_unlock();
if (colon)
*colon = ':';
@@ -457,7 +555,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCSIFTXQLEN:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- /* fall through */
+ fallthrough;
/*
* These ioctl calls:
* - require local superuser power.
@@ -482,12 +580,12 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCSHWTSTAMP:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- /* fall through */
+ fallthrough;
case SIOCBONDSLAVEINFOQUERY:
case SIOCBONDINFOQUERY:
dev_load(net, ifr->ifr_name);
rtnl_lock();
- ret = dev_ifsioc(net, ifr, cmd);
+ ret = dev_ifsioc(net, ifr, data, cmd);
rtnl_unlock();
if (need_copyout)
*need_copyout = false;
@@ -512,7 +610,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
cmd <= SIOCDEVPRIVATE + 15)) {
dev_load(net, ifr->ifr_name);
rtnl_lock();
- ret = dev_ifsioc(net, ifr, cmd);
+ ret = dev_ifsioc(net, ifr, data, cmd);
rtnl_unlock();
return ret;
}
diff --git a/net/core/devlink.c b/net/core/devlink.c
index b831c5545d6a..89baa7c0938b 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -7,6 +7,7 @@
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
*/
+#include <linux/etherdevice.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -27,10 +28,117 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/devlink.h>
-#include <net/drop_monitor.h>
#define CREATE_TRACE_POINTS
#include <trace/events/devlink.h>
+#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \
+ (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX)
+
+struct devlink_dev_stats {
+ u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE];
+ u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE];
+};
+
+struct devlink {
+ u32 index;
+ struct list_head port_list;
+ struct list_head rate_list;
+ struct list_head sb_list;
+ struct list_head dpipe_table_list;
+ struct list_head resource_list;
+ struct list_head param_list;
+ struct list_head region_list;
+ struct list_head reporter_list;
+ struct mutex reporters_lock; /* protects reporter_list */
+ struct devlink_dpipe_headers *dpipe_headers;
+ struct list_head trap_list;
+ struct list_head trap_group_list;
+ struct list_head trap_policer_list;
+ struct list_head linecard_list;
+ struct mutex linecards_lock; /* protects linecard_list */
+ const struct devlink_ops *ops;
+ u64 features;
+ struct xarray snapshot_ids;
+ struct devlink_dev_stats stats;
+ struct device *dev;
+ possible_net_t _net;
+ /* Serializes access to devlink instance specific objects such as
+ * port, sb, dpipe, resource, params, region, traps and more.
+ */
+ struct mutex lock;
+ struct lock_class_key lock_key;
+ u8 reload_failed:1;
+ refcount_t refcount;
+ struct completion comp;
+ struct rcu_head rcu;
+ char priv[] __aligned(NETDEV_ALIGN);
+};
+
+struct devlink_linecard_ops;
+struct devlink_linecard_type;
+
+struct devlink_linecard {
+ struct list_head list;
+ struct devlink *devlink;
+ unsigned int index;
+ refcount_t refcount;
+ const struct devlink_linecard_ops *ops;
+ void *priv;
+ enum devlink_linecard_state state;
+ struct mutex state_lock; /* Protects state */
+ const char *type;
+ struct devlink_linecard_type *types;
+ unsigned int types_count;
+ struct devlink *nested_devlink;
+};
+
+/**
+ * struct devlink_resource - devlink resource
+ * @name: name of the resource
+ * @id: id, per devlink instance
+ * @size: size of the resource
+ * @size_new: updated size of the resource, reload is needed
+ * @size_valid: valid in case the total size of the resource is valid
+ * including its children
+ * @parent: parent resource
+ * @size_params: size parameters
+ * @list: parent list
+ * @resource_list: list of child resources
+ * @occ_get: occupancy getter callback
+ * @occ_get_priv: occupancy getter callback priv
+ */
+struct devlink_resource {
+ const char *name;
+ u64 id;
+ u64 size;
+ u64 size_new;
+ bool size_valid;
+ struct devlink_resource *parent;
+ struct devlink_resource_size_params size_params;
+ struct list_head list;
+ struct list_head resource_list;
+ devlink_resource_occ_get_t *occ_get;
+ void *occ_get_priv;
+};
+
+void *devlink_priv(struct devlink *devlink)
+{
+ return &devlink->priv;
+}
+EXPORT_SYMBOL_GPL(devlink_priv);
+
+struct devlink *priv_to_devlink(void *priv)
+{
+ return container_of(priv, struct devlink, priv);
+}
+EXPORT_SYMBOL_GPL(priv_to_devlink);
+
+struct device *devlink_to_dev(const struct devlink *devlink)
+{
+ return devlink->dev;
+}
+EXPORT_SYMBOL_GPL(devlink_to_dev);
+
static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = {
{
.name = "destination mac",
@@ -46,7 +154,7 @@ struct devlink_dpipe_header devlink_dpipe_header_ethernet = {
.fields_count = ARRAY_SIZE(devlink_dpipe_fields_ethernet),
.global = true,
};
-EXPORT_SYMBOL(devlink_dpipe_header_ethernet);
+EXPORT_SYMBOL_GPL(devlink_dpipe_header_ethernet);
static struct devlink_dpipe_field devlink_dpipe_fields_ipv4[] = {
{
@@ -63,7 +171,7 @@ struct devlink_dpipe_header devlink_dpipe_header_ipv4 = {
.fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv4),
.global = true,
};
-EXPORT_SYMBOL(devlink_dpipe_header_ipv4);
+EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv4);
static struct devlink_dpipe_field devlink_dpipe_fields_ipv6[] = {
{
@@ -80,20 +188,42 @@ struct devlink_dpipe_header devlink_dpipe_header_ipv6 = {
.fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv6),
.global = true,
};
-EXPORT_SYMBOL(devlink_dpipe_header_ipv6);
+EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv6);
EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);
-static LIST_HEAD(devlink_list);
+static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = {
+ [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY },
+ [DEVLINK_PORT_FN_ATTR_STATE] =
+ NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE,
+ DEVLINK_PORT_FN_STATE_ACTIVE),
+};
+
+static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = {
+ [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG },
+};
-/* devlink_mutex
+static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC);
+#define DEVLINK_REGISTERED XA_MARK_1
+#define DEVLINK_UNREGISTERING XA_MARK_2
+
+/* devlink instances are open to the access from the user space after
+ * devlink_register() call. Such logical barrier allows us to have certain
+ * expectations related to locking.
+ *
+ * Before *_register() - we are in initialization stage and no parallel
+ * access possible to the devlink instance. All drivers perform that phase
+ * by implicitly holding device_lock.
*
- * An overall lock guarding every operation coming from userspace.
- * It also guards devlink devices list and it is taken when
- * driver registers/unregisters it.
+ * After *_register() - users and driver can access devlink instance at
+ * the same time.
*/
-static DEFINE_MUTEX(devlink_mutex);
+#define ASSERT_DEVLINK_REGISTERED(d) \
+ WARN_ON_ONCE(!xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED))
+#define ASSERT_DEVLINK_NOT_REGISTERED(d) \
+ WARN_ON_ONCE(xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED))
struct net *devlink_net(const struct devlink *devlink)
{
@@ -101,23 +231,127 @@ struct net *devlink_net(const struct devlink *devlink)
}
EXPORT_SYMBOL_GPL(devlink_net);
-static void __devlink_net_set(struct devlink *devlink, struct net *net)
+static void __devlink_put_rcu(struct rcu_head *head)
{
- write_pnet(&devlink->_net, net);
+ struct devlink *devlink = container_of(head, struct devlink, rcu);
+
+ complete(&devlink->comp);
}
-void devlink_net_set(struct devlink *devlink, struct net *net)
+void devlink_put(struct devlink *devlink)
{
- if (WARN_ON(devlink->registered))
- return;
- __devlink_net_set(devlink, net);
+ if (refcount_dec_and_test(&devlink->refcount))
+ /* Make sure unregister operation that may await the completion
+ * is unblocked only after all users are after the end of
+ * RCU grace period.
+ */
+ call_rcu(&devlink->rcu, __devlink_put_rcu);
}
-EXPORT_SYMBOL_GPL(devlink_net_set);
+
+struct devlink *__must_check devlink_try_get(struct devlink *devlink)
+{
+ if (refcount_inc_not_zero(&devlink->refcount))
+ return devlink;
+ return NULL;
+}
+
+void devl_assert_locked(struct devlink *devlink)
+{
+ lockdep_assert_held(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_assert_locked);
+
+#ifdef CONFIG_LOCKDEP
+/* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */
+bool devl_lock_is_held(struct devlink *devlink)
+{
+ return lockdep_is_held(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_lock_is_held);
+#endif
+
+void devl_lock(struct devlink *devlink)
+{
+ mutex_lock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_lock);
+
+int devl_trylock(struct devlink *devlink)
+{
+ return mutex_trylock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_trylock);
+
+void devl_unlock(struct devlink *devlink)
+{
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_unlock);
+
+static struct devlink *
+devlinks_xa_find_get(struct net *net, unsigned long *indexp, xa_mark_t filter,
+ void * (*xa_find_fn)(struct xarray *, unsigned long *,
+ unsigned long, xa_mark_t))
+{
+ struct devlink *devlink;
+
+ rcu_read_lock();
+retry:
+ devlink = xa_find_fn(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED);
+ if (!devlink)
+ goto unlock;
+
+ /* In case devlink_unregister() was already called and "unregistering"
+ * mark was set, do not allow to get a devlink reference here.
+ * This prevents live-lock of devlink_unregister() wait for completion.
+ */
+ if (xa_get_mark(&devlinks, *indexp, DEVLINK_UNREGISTERING))
+ goto retry;
+
+ /* For a possible retry, the xa_find_after() should be always used */
+ xa_find_fn = xa_find_after;
+ if (!devlink_try_get(devlink))
+ goto retry;
+ if (!net_eq(devlink_net(devlink), net)) {
+ devlink_put(devlink);
+ goto retry;
+ }
+unlock:
+ rcu_read_unlock();
+ return devlink;
+}
+
+static struct devlink *devlinks_xa_find_get_first(struct net *net,
+ unsigned long *indexp,
+ xa_mark_t filter)
+{
+ return devlinks_xa_find_get(net, indexp, filter, xa_find);
+}
+
+static struct devlink *devlinks_xa_find_get_next(struct net *net,
+ unsigned long *indexp,
+ xa_mark_t filter)
+{
+ return devlinks_xa_find_get(net, indexp, filter, xa_find_after);
+}
+
+/* Iterate over devlink pointers which were possible to get reference to.
+ * devlink_put() needs to be called for each iterated devlink pointer
+ * in loop body in order to release the reference.
+ */
+#define devlinks_xa_for_each_get(net, index, devlink, filter) \
+ for (index = 0, \
+ devlink = devlinks_xa_find_get_first(net, &index, filter); \
+ devlink; devlink = devlinks_xa_find_get_next(net, &index, filter))
+
+#define devlinks_xa_for_each_registered_get(net, index, devlink) \
+ devlinks_xa_for_each_get(net, index, devlink, DEVLINK_REGISTERED)
static struct devlink *devlink_get_from_attrs(struct net *net,
struct nlattr **attrs)
{
struct devlink *devlink;
+ unsigned long index;
char *busname;
char *devname;
@@ -127,22 +361,22 @@ static struct devlink *devlink_get_from_attrs(struct net *net,
busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]);
devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]);
- lockdep_assert_held(&devlink_mutex);
-
- list_for_each_entry(devlink, &devlink_list, list) {
+ devlinks_xa_for_each_registered_get(net, index, devlink) {
if (strcmp(devlink->dev->bus->name, busname) == 0 &&
- strcmp(dev_name(devlink->dev), devname) == 0 &&
- net_eq(devlink_net(devlink), net))
+ strcmp(dev_name(devlink->dev), devname) == 0)
return devlink;
+ devlink_put(devlink);
}
return ERR_PTR(-ENODEV);
}
-static struct devlink *devlink_get_from_info(struct genl_info *info)
-{
- return devlink_get_from_attrs(genl_info_net(info), info->attrs);
-}
+#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \
+ WARN_ON_ONCE(!(devlink_port)->registered)
+#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \
+ WARN_ON_ONCE((devlink_port)->registered)
+#define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \
+ WARN_ON_ONCE(!(devlink_port)->initialized)
static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
unsigned int port_index)
@@ -183,6 +417,132 @@ static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
return devlink_port_get_from_attrs(devlink, info->attrs);
}
+static inline bool
+devlink_rate_is_leaf(struct devlink_rate *devlink_rate)
+{
+ return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF;
+}
+
+static inline bool
+devlink_rate_is_node(struct devlink_rate *devlink_rate)
+{
+ return devlink_rate->type == DEVLINK_RATE_TYPE_NODE;
+}
+
+static struct devlink_rate *
+devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+ struct devlink_rate *devlink_rate;
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_from_attrs(devlink, info->attrs);
+ if (IS_ERR(devlink_port))
+ return ERR_CAST(devlink_port);
+ devlink_rate = devlink_port->devlink_rate;
+ return devlink_rate ?: ERR_PTR(-ENODEV);
+}
+
+static struct devlink_rate *
+devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name)
+{
+ static struct devlink_rate *devlink_rate;
+
+ list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
+ if (devlink_rate_is_node(devlink_rate) &&
+ !strcmp(node_name, devlink_rate->name))
+ return devlink_rate;
+ }
+ return ERR_PTR(-ENODEV);
+}
+
+static struct devlink_rate *
+devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs)
+{
+ const char *rate_node_name;
+ size_t len;
+
+ if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME])
+ return ERR_PTR(-EINVAL);
+ rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]);
+ len = strlen(rate_node_name);
+ /* Name cannot be empty or decimal number */
+ if (!len || strspn(rate_node_name, "0123456789") == len)
+ return ERR_PTR(-EINVAL);
+
+ return devlink_rate_node_get_by_name(devlink, rate_node_name);
+}
+
+static struct devlink_rate *
+devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+ return devlink_rate_node_get_from_attrs(devlink, info->attrs);
+}
+
+static struct devlink_rate *
+devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+ struct nlattr **attrs = info->attrs;
+
+ if (attrs[DEVLINK_ATTR_PORT_INDEX])
+ return devlink_rate_leaf_get_from_info(devlink, info);
+ else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME])
+ return devlink_rate_node_get_from_info(devlink, info);
+ else
+ return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_linecard *
+devlink_linecard_get_by_index(struct devlink *devlink,
+ unsigned int linecard_index)
+{
+ struct devlink_linecard *devlink_linecard;
+
+ list_for_each_entry(devlink_linecard, &devlink->linecard_list, list) {
+ if (devlink_linecard->index == linecard_index)
+ return devlink_linecard;
+ }
+ return NULL;
+}
+
+static bool devlink_linecard_index_exists(struct devlink *devlink,
+ unsigned int linecard_index)
+{
+ return devlink_linecard_get_by_index(devlink, linecard_index);
+}
+
+static struct devlink_linecard *
+devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs)
+{
+ if (attrs[DEVLINK_ATTR_LINECARD_INDEX]) {
+ u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]);
+ struct devlink_linecard *linecard;
+
+ mutex_lock(&devlink->linecards_lock);
+ linecard = devlink_linecard_get_by_index(devlink, linecard_index);
+ if (linecard)
+ refcount_inc(&linecard->refcount);
+ mutex_unlock(&devlink->linecards_lock);
+ if (!linecard)
+ return ERR_PTR(-ENODEV);
+ return linecard;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_linecard *
+devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+ return devlink_linecard_get_from_attrs(devlink, info->attrs);
+}
+
+static void devlink_linecard_put(struct devlink_linecard *linecard)
+{
+ if (refcount_dec_and_test(&linecard->refcount)) {
+ mutex_destroy(&linecard->state_lock);
+ kfree(linecard);
+ }
+}
+
struct devlink_sb {
struct list_head list;
unsigned int index;
@@ -343,8 +703,16 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
struct devlink_region {
struct devlink *devlink;
+ struct devlink_port *port;
struct list_head list;
- const char *name;
+ union {
+ const struct devlink_region_ops *ops;
+ const struct devlink_port_region_ops *port_ops;
+ };
+ struct mutex snapshot_lock; /* protects snapshot_list,
+ * max_snapshots and cur_snapshots
+ * consistency.
+ */
struct list_head snapshot_list;
u32 max_snapshots;
u32 cur_snapshots;
@@ -354,7 +722,6 @@ struct devlink_region {
struct devlink_snapshot {
struct list_head list;
struct devlink_region *region;
- devlink_snapshot_data_dest_t *data_destructor;
u8 *data;
u32 id;
};
@@ -365,7 +732,20 @@ devlink_region_get_by_name(struct devlink *devlink, const char *region_name)
struct devlink_region *region;
list_for_each_entry(region, &devlink->region_list, list)
- if (!strcmp(region->name, region_name))
+ if (!strcmp(region->ops->name, region_name))
+ return region;
+
+ return NULL;
+}
+
+static struct devlink_region *
+devlink_port_region_get_by_name(struct devlink_port *port,
+ const char *region_name)
+{
+ struct devlink_region *region;
+
+ list_for_each_entry(region, &port->region_list, list)
+ if (!strcmp(region->ops->name, region_name))
return region;
return NULL;
@@ -383,78 +763,83 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
return NULL;
}
-#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0)
-#define DEVLINK_NL_FLAG_NEED_PORT BIT(1)
-#define DEVLINK_NL_FLAG_NEED_SB BIT(2)
-
-/* The per devlink instance lock is taken by default in the pre-doit
- * operation, yet several commands do not require this. The global
- * devlink lock is taken and protects from disruption by user-calls.
- */
-#define DEVLINK_NL_FLAG_NO_LOCK BIT(3)
+#define DEVLINK_NL_FLAG_NEED_PORT BIT(0)
+#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1)
+#define DEVLINK_NL_FLAG_NEED_RATE BIT(2)
+#define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3)
+#define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4)
static int devlink_nl_pre_doit(const struct genl_ops *ops,
struct sk_buff *skb, struct genl_info *info)
{
+ struct devlink_linecard *linecard;
+ struct devlink_port *devlink_port;
struct devlink *devlink;
int err;
- mutex_lock(&devlink_mutex);
- devlink = devlink_get_from_info(info);
- if (IS_ERR(devlink)) {
- mutex_unlock(&devlink_mutex);
+ devlink = devlink_get_from_attrs(genl_info_net(info), info->attrs);
+ if (IS_ERR(devlink))
return PTR_ERR(devlink);
- }
- if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
- mutex_lock(&devlink->lock);
- if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) {
- info->user_ptr[0] = devlink;
- } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
- struct devlink_port *devlink_port;
-
+ devl_lock(devlink);
+ info->user_ptr[0] = devlink;
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
devlink_port = devlink_port_get_from_info(devlink, info);
if (IS_ERR(devlink_port)) {
err = PTR_ERR(devlink_port);
goto unlock;
}
- info->user_ptr[0] = devlink_port;
- }
- if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) {
- struct devlink_sb *devlink_sb;
+ info->user_ptr[1] = devlink_port;
+ } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) {
+ devlink_port = devlink_port_get_from_info(devlink, info);
+ if (!IS_ERR(devlink_port))
+ info->user_ptr[1] = devlink_port;
+ } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE) {
+ struct devlink_rate *devlink_rate;
+
+ devlink_rate = devlink_rate_get_from_info(devlink, info);
+ if (IS_ERR(devlink_rate)) {
+ err = PTR_ERR(devlink_rate);
+ goto unlock;
+ }
+ info->user_ptr[1] = devlink_rate;
+ } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_RATE_NODE) {
+ struct devlink_rate *rate_node;
- devlink_sb = devlink_sb_get_from_info(devlink, info);
- if (IS_ERR(devlink_sb)) {
- err = PTR_ERR(devlink_sb);
+ rate_node = devlink_rate_node_get_from_info(devlink, info);
+ if (IS_ERR(rate_node)) {
+ err = PTR_ERR(rate_node);
+ goto unlock;
+ }
+ info->user_ptr[1] = rate_node;
+ } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) {
+ linecard = devlink_linecard_get_from_info(devlink, info);
+ if (IS_ERR(linecard)) {
+ err = PTR_ERR(linecard);
goto unlock;
}
- info->user_ptr[1] = devlink_sb;
+ info->user_ptr[1] = linecard;
}
return 0;
unlock:
- if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
- mutex_unlock(&devlink->lock);
- mutex_unlock(&devlink_mutex);
+ devl_unlock(devlink);
+ devlink_put(devlink);
return err;
}
static void devlink_nl_post_doit(const struct genl_ops *ops,
struct sk_buff *skb, struct genl_info *info)
{
+ struct devlink_linecard *linecard;
struct devlink *devlink;
- /* When devlink changes netns, it would not be found
- * by devlink_get_from_info(). So try if it is stored first.
- */
- if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) {
- devlink = info->user_ptr[0];
- } else {
- devlink = devlink_get_from_info(info);
- WARN_ON(IS_ERR(devlink));
+ devlink = info->user_ptr[0];
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_LINECARD) {
+ linecard = info->user_ptr[1];
+ devlink_linecard_put(linecard);
}
- if (!IS_ERR(devlink) && ~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
- mutex_unlock(&devlink->lock);
- mutex_unlock(&devlink_mutex);
+ devl_unlock(devlink);
+ devlink_put(devlink);
}
static struct genl_family devlink_nl_family;
@@ -476,10 +861,150 @@ static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
return 0;
}
+static int devlink_nl_put_nested_handle(struct sk_buff *msg, struct devlink *devlink)
+{
+ struct nlattr *nested_attr;
+
+ nested_attr = nla_nest_start(msg, DEVLINK_ATTR_NESTED_DEVLINK);
+ if (!nested_attr)
+ return -EMSGSIZE;
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nested_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, nested_attr);
+ return -EMSGSIZE;
+}
+
+struct devlink_reload_combination {
+ enum devlink_reload_action action;
+ enum devlink_reload_limit limit;
+};
+
+static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = {
+ {
+ /* can't reinitialize driver with no down time */
+ .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
+ .limit = DEVLINK_RELOAD_LIMIT_NO_RESET,
+ },
+};
+
+static bool
+devlink_reload_combination_is_invalid(enum devlink_reload_action action,
+ enum devlink_reload_limit limit)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++)
+ if (devlink_reload_invalid_combinations[i].action == action &&
+ devlink_reload_invalid_combinations[i].limit == limit)
+ return true;
+ return false;
+}
+
+static bool
+devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action)
+{
+ return test_bit(action, &devlink->ops->reload_actions);
+}
+
+static bool
+devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit)
+{
+ return test_bit(limit, &devlink->ops->reload_limits);
+}
+
+static int devlink_reload_stat_put(struct sk_buff *msg,
+ enum devlink_reload_limit limit, u32 value)
+{
+ struct nlattr *reload_stats_entry;
+
+ reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY);
+ if (!reload_stats_entry)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) ||
+ nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value))
+ goto nla_put_failure;
+ nla_nest_end(msg, reload_stats_entry);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, reload_stats_entry);
+ return -EMSGSIZE;
+}
+
+static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote)
+{
+ struct nlattr *reload_stats_attr, *act_info, *act_stats;
+ int i, j, stat_idx;
+ u32 value;
+
+ if (!is_remote)
+ reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS);
+ else
+ reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS);
+
+ if (!reload_stats_attr)
+ return -EMSGSIZE;
+
+ for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) {
+ if ((!is_remote &&
+ !devlink_reload_action_is_supported(devlink, i)) ||
+ i == DEVLINK_RELOAD_ACTION_UNSPEC)
+ continue;
+ act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO);
+ if (!act_info)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i))
+ goto action_info_nest_cancel;
+ act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS);
+ if (!act_stats)
+ goto action_info_nest_cancel;
+
+ for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) {
+ /* Remote stats are shown even if not locally supported.
+ * Stats of actions with unspecified limit are shown
+ * though drivers don't need to register unspecified
+ * limit.
+ */
+ if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC &&
+ !devlink_reload_limit_is_supported(devlink, j)) ||
+ devlink_reload_combination_is_invalid(i, j))
+ continue;
+
+ stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i;
+ if (!is_remote)
+ value = devlink->stats.reload_stats[stat_idx];
+ else
+ value = devlink->stats.remote_reload_stats[stat_idx];
+ if (devlink_reload_stat_put(msg, j, value))
+ goto action_stats_nest_cancel;
+ }
+ nla_nest_end(msg, act_stats);
+ nla_nest_end(msg, act_info);
+ }
+ nla_nest_end(msg, reload_stats_attr);
+ return 0;
+
+action_stats_nest_cancel:
+ nla_nest_cancel(msg, act_stats);
+action_info_nest_cancel:
+ nla_nest_cancel(msg, act_info);
+nla_put_failure:
+ nla_nest_cancel(msg, reload_stats_attr);
+ return -EMSGSIZE;
+}
+
static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
enum devlink_command cmd, u32 portid,
u32 seq, int flags)
{
+ struct nlattr *dev_stats;
void *hdr;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
@@ -491,9 +1016,21 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed))
goto nla_put_failure;
+ dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS);
+ if (!dev_stats)
+ goto nla_put_failure;
+
+ if (devlink_reload_stats_put(msg, devlink, false))
+ goto dev_stats_nest_cancel;
+ if (devlink_reload_stats_put(msg, devlink, true))
+ goto dev_stats_nest_cancel;
+
+ nla_nest_end(msg, dev_stats);
genlmsg_end(msg, hdr);
return 0;
+dev_stats_nest_cancel:
+ nla_nest_cancel(msg, dev_stats);
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
@@ -505,6 +1042,7 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
int err;
WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
+ WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED));
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
@@ -525,21 +1063,41 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
- if (!attrs->set)
+ if (!devlink_port->attrs_set)
return 0;
+ if (attrs->lanes) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes))
+ return -EMSGSIZE;
+ }
+ if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable))
+ return -EMSGSIZE;
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
return -EMSGSIZE;
switch (devlink_port->attrs.flavour) {
case DEVLINK_PORT_FLAVOUR_PCI_PF:
- if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
- attrs->pci_pf.pf))
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_pf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external))
return -EMSGSIZE;
break;
case DEVLINK_PORT_FLAVOUR_PCI_VF:
- if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
- attrs->pci_vf.pf) ||
- nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER,
- attrs->pci_vf.vf))
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_vf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PORT_FLAVOUR_PCI_SF:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_sf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
+ attrs->pci_sf.pf) ||
+ nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER,
+ attrs->pci_sf.sf))
return -EMSGSIZE;
break;
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
@@ -563,11 +1121,166 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
return 0;
}
-static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
+static int devlink_port_fn_hw_addr_fill(const struct devlink_ops *ops,
+ struct devlink_port *port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
+{
+ u8 hw_addr[MAX_ADDR_LEN];
+ int hw_addr_len;
+ int err;
+
+ if (!ops->port_function_hw_addr_get)
+ return 0;
+
+ err = ops->port_function_hw_addr_get(port, hw_addr, &hw_addr_len,
+ extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+ err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr);
+ if (err)
+ return err;
+ *msg_updated = true;
+ return 0;
+}
+
+static int devlink_nl_rate_fill(struct sk_buff *msg,
+ struct devlink_rate *devlink_rate,
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags, struct netlink_ext_ack *extack)
+{
+ struct devlink *devlink = devlink_rate->devlink;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type))
+ goto nla_put_failure;
+
+ if (devlink_rate_is_leaf(devlink_rate)) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
+ devlink_rate->devlink_port->index))
+ goto nla_put_failure;
+ } else if (devlink_rate_is_node(devlink_rate)) {
+ if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME,
+ devlink_rate->name))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_SHARE,
+ devlink_rate->tx_share, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_MAX,
+ devlink_rate->tx_max, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (devlink_rate->parent)
+ if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME,
+ devlink_rate->parent->name))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static bool
+devlink_port_fn_state_valid(enum devlink_port_fn_state state)
+{
+ return state == DEVLINK_PORT_FN_STATE_INACTIVE ||
+ state == DEVLINK_PORT_FN_STATE_ACTIVE;
+}
+
+static bool
+devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate)
+{
+ return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED ||
+ opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED;
+}
+
+static int devlink_port_fn_state_fill(const struct devlink_ops *ops,
+ struct devlink_port *port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
+{
+ enum devlink_port_fn_opstate opstate;
+ enum devlink_port_fn_state state;
+ int err;
+
+ if (!ops->port_fn_state_get)
+ return 0;
+
+ err = ops->port_fn_state_get(port, &state, &opstate, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+ if (!devlink_port_fn_state_valid(state)) {
+ WARN_ON_ONCE(1);
+ NL_SET_ERR_MSG_MOD(extack, "Invalid state read from driver");
+ return -EINVAL;
+ }
+ if (!devlink_port_fn_opstate_valid(opstate)) {
+ WARN_ON_ONCE(1);
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid operational state read from driver");
+ return -EINVAL;
+ }
+ if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) ||
+ nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate))
+ return -EMSGSIZE;
+ *msg_updated = true;
+ return 0;
+}
+
+static int
+devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
+ struct netlink_ext_ack *extack)
+{
+ const struct devlink_ops *ops;
+ struct nlattr *function_attr;
+ bool msg_updated = false;
+ int err;
+
+ function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION);
+ if (!function_attr)
+ return -EMSGSIZE;
+
+ ops = port->devlink->ops;
+ err = devlink_port_fn_hw_addr_fill(ops, port, msg, extack,
+ &msg_updated);
+ if (err)
+ goto out;
+ err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated);
+out:
+ if (err || !msg_updated)
+ nla_nest_cancel(msg, function_attr);
+ else
+ nla_nest_end(msg, function_attr);
+ return err;
+}
+
+static int devlink_nl_port_fill(struct sk_buff *msg,
struct devlink_port *devlink_port,
- enum devlink_command cmd, u32 portid,
- u32 seq, int flags)
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags, struct netlink_ext_ack *extack)
{
+ struct devlink *devlink = devlink_port->devlink;
void *hdr;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
@@ -579,6 +1292,8 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
goto nla_put_failure;
+ /* Hold rtnl lock while accessing port's netdev attributes. */
+ rtnl_lock();
spin_lock_bh(&devlink_port->type_lock);
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
goto nla_put_failure_type_locked;
@@ -587,9 +1302,10 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
devlink_port->desired_type))
goto nla_put_failure_type_locked;
if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
+ struct net *net = devlink_net(devlink_port->devlink);
struct net_device *netdev = devlink_port->type_dev;
- if (netdev &&
+ if (netdev && net_eq(net, dev_net(netdev)) &&
(nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
netdev->ifindex) ||
nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
@@ -605,14 +1321,22 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
goto nla_put_failure_type_locked;
}
spin_unlock_bh(&devlink_port->type_lock);
+ rtnl_unlock();
if (devlink_nl_port_attrs_put(msg, devlink_port))
goto nla_put_failure;
+ if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack))
+ goto nla_put_failure;
+ if (devlink_port->linecard &&
+ nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX,
+ devlink_port->linecard->index))
+ goto nla_put_failure;
genlmsg_end(msg, hdr);
return 0;
nla_put_failure_type_locked:
spin_unlock_bh(&devlink_port->type_lock);
+ rtnl_unlock();
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
@@ -625,23 +1349,124 @@ static void devlink_port_notify(struct devlink_port *devlink_port,
struct sk_buff *msg;
int err;
- if (!devlink_port->registered)
+ WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
+
+ if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
return;
- WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg,
+ 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static void devlink_rate_notify(struct devlink_rate *devlink_rate,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = devlink_rate->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL);
+
+ if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
+ return;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
- err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0);
+ err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL);
if (err) {
nlmsg_free(msg);
return;
}
- genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
- msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg,
+ 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink_rate *devlink_rate;
+ struct devlink *devlink;
+ int start = cb->args[0];
+ unsigned long index;
+ int idx = 0;
+ int err = 0;
+
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ devl_lock(devlink);
+ list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
+ enum devlink_command cmd = DEVLINK_CMD_RATE_NEW;
+ u32 id = NETLINK_CB(cb->skb).portid;
+
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, NULL);
+ if (err) {
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ goto out;
+ }
+ idx++;
+ }
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ }
+out:
+ if (err != -EMSGSIZE)
+ return err;
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_rate *devlink_rate = info->user_ptr[1];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW,
+ info->snd_portid, info->snd_seq, 0,
+ info->extack);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static bool
+devlink_rate_is_parent_node(struct devlink_rate *devlink_rate,
+ struct devlink_rate *parent)
+{
+ while (parent) {
+ if (parent == devlink_rate)
+ return true;
+ parent = parent->parent;
+ }
+ return false;
}
static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
@@ -669,27 +1494,26 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
{
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err;
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
if (idx < start) {
idx++;
+ devlink_put(devlink);
continue;
}
+
err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ devlink_put(devlink);
if (err)
goto out;
idx++;
}
out:
- mutex_unlock(&devlink_mutex);
-
cb->args[0] = idx;
return msg->len;
}
@@ -697,8 +1521,7 @@ out:
static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink_port *devlink_port = info->user_ptr[0];
- struct devlink *devlink = devlink_port->devlink;
+ struct devlink_port *devlink_port = info->user_ptr[1];
struct sk_buff *msg;
int err;
@@ -706,9 +1529,9 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
if (!msg)
return -ENOMEM;
- err = devlink_nl_port_fill(msg, devlink, devlink_port,
- DEVLINK_CMD_PORT_NEW,
- info->snd_portid, info->snd_seq, 0);
+ err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW,
+ info->snd_portid, info->snd_seq, 0,
+ info->extack);
if (err) {
nlmsg_free(msg);
return err;
@@ -723,125 +1546,871 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
struct devlink *devlink;
struct devlink_port *devlink_port;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err;
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
- mutex_lock(&devlink->lock);
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ devl_lock(devlink);
list_for_each_entry(devlink_port, &devlink->port_list, list) {
if (idx < start) {
idx++;
continue;
}
- err = devlink_nl_port_fill(msg, devlink, devlink_port,
+ err = devlink_nl_port_fill(msg, devlink_port,
DEVLINK_CMD_NEW,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- NLM_F_MULTI);
+ NLM_F_MULTI, cb->extack);
if (err) {
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
+ devlink_put(devlink);
goto out;
}
idx++;
}
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
+ devlink_put(devlink);
}
out:
- mutex_unlock(&devlink_mutex);
-
cb->args[0] = idx;
return msg->len;
}
-static int devlink_port_type_set(struct devlink *devlink,
- struct devlink_port *devlink_port,
+static int devlink_port_type_set(struct devlink_port *devlink_port,
enum devlink_port_type port_type)
{
int err;
- if (devlink->ops->port_type_set) {
- if (port_type == DEVLINK_PORT_TYPE_NOTSET)
+ if (!devlink_port->devlink->ops->port_type_set)
+ return -EOPNOTSUPP;
+
+ if (port_type == devlink_port->type)
+ return 0;
+
+ err = devlink_port->devlink->ops->port_type_set(devlink_port,
+ port_type);
+ if (err)
+ return err;
+
+ devlink_port->desired_type = port_type;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+}
+
+static int devlink_port_function_hw_addr_set(struct devlink_port *port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const struct devlink_ops *ops = port->devlink->ops;
+ const u8 *hw_addr;
+ int hw_addr_len;
+
+ hw_addr = nla_data(attr);
+ hw_addr_len = nla_len(attr);
+ if (hw_addr_len > MAX_ADDR_LEN) {
+ NL_SET_ERR_MSG_MOD(extack, "Port function hardware address too long");
+ return -EINVAL;
+ }
+ if (port->type == DEVLINK_PORT_TYPE_ETH) {
+ if (hw_addr_len != ETH_ALEN) {
+ NL_SET_ERR_MSG_MOD(extack, "Address must be 6 bytes for Ethernet device");
return -EINVAL;
- if (port_type == devlink_port->type)
- return 0;
- err = devlink->ops->port_type_set(devlink_port, port_type);
+ }
+ if (!is_unicast_ether_addr(hw_addr)) {
+ NL_SET_ERR_MSG_MOD(extack, "Non-unicast hardware address unsupported");
+ return -EINVAL;
+ }
+ }
+
+ if (!ops->port_function_hw_addr_set) {
+ NL_SET_ERR_MSG_MOD(extack, "Port doesn't support function attributes");
+ return -EOPNOTSUPP;
+ }
+
+ return ops->port_function_hw_addr_set(port, hw_addr, hw_addr_len,
+ extack);
+}
+
+static int devlink_port_fn_state_set(struct devlink_port *port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ enum devlink_port_fn_state state;
+ const struct devlink_ops *ops;
+
+ state = nla_get_u8(attr);
+ ops = port->devlink->ops;
+ if (!ops->port_fn_state_set) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Function does not support state setting");
+ return -EOPNOTSUPP;
+ }
+ return ops->port_fn_state_set(port, state, extack);
+}
+
+static int devlink_port_function_set(struct devlink_port *port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr,
+ devlink_function_nl_policy, extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Fail to parse port function attributes");
+ return err;
+ }
+
+ attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR];
+ if (attr) {
+ err = devlink_port_function_hw_addr_set(port, attr, extack);
if (err)
return err;
- devlink_port->desired_type = port_type;
- devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
- return 0;
}
- return -EOPNOTSUPP;
+ /* Keep this as the last function attribute set, so that when
+ * multiple port function attributes are set along with state,
+ * Those can be applied first before activating the state.
+ */
+ attr = tb[DEVLINK_PORT_FN_ATTR_STATE];
+ if (attr)
+ err = devlink_port_fn_state_set(port, attr, extack);
+
+ if (!err)
+ devlink_port_notify(port, DEVLINK_CMD_PORT_NEW);
+ return err;
}
static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink_port *devlink_port = info->user_ptr[0];
- struct devlink *devlink = devlink_port->devlink;
+ struct devlink_port *devlink_port = info->user_ptr[1];
int err;
if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
enum devlink_port_type port_type;
port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
- err = devlink_port_type_set(devlink, devlink_port, port_type);
+ err = devlink_port_type_set(devlink_port, port_type);
if (err)
return err;
}
- return 0;
-}
-static int devlink_port_split(struct devlink *devlink, u32 port_index,
- u32 count, struct netlink_ext_ack *extack)
+ if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) {
+ struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION];
+ struct netlink_ext_ack *extack = info->extack;
-{
- if (devlink->ops->port_split)
- return devlink->ops->port_split(devlink, port_index, count,
- extack);
- return -EOPNOTSUPP;
+ err = devlink_port_function_set(devlink_port, attr, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
}
static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
struct genl_info *info)
{
+ struct devlink_port *devlink_port = info->user_ptr[1];
struct devlink *devlink = info->user_ptr[0];
- u32 port_index;
u32 count;
- if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] ||
- !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT))
return -EINVAL;
+ if (!devlink->ops->port_split)
+ return -EOPNOTSUPP;
- port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
- return devlink_port_split(devlink, port_index, count, info->extack);
-}
-static int devlink_port_unsplit(struct devlink *devlink, u32 port_index,
- struct netlink_ext_ack *extack)
+ if (!devlink_port->attrs.splittable) {
+ /* Split ports cannot be split. */
+ if (devlink_port->attrs.split)
+ NL_SET_ERR_MSG_MOD(info->extack, "Port cannot be split further");
+ else
+ NL_SET_ERR_MSG_MOD(info->extack, "Port cannot be split");
+ return -EINVAL;
+ }
-{
- if (devlink->ops->port_unsplit)
- return devlink->ops->port_unsplit(devlink, port_index, extack);
- return -EOPNOTSUPP;
+ if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Invalid split count");
+ return -EINVAL;
+ }
+
+ return devlink->ops->port_split(devlink, devlink_port, count,
+ info->extack);
}
static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
struct genl_info *info)
{
+ struct devlink_port *devlink_port = info->user_ptr[1];
struct devlink *devlink = info->user_ptr[0];
- u32 port_index;
- if (!info->attrs[DEVLINK_ATTR_PORT_INDEX])
+ if (!devlink->ops->port_unsplit)
+ return -EOPNOTSUPP;
+ return devlink->ops->port_unsplit(devlink, devlink_port, info->extack);
+}
+
+static int devlink_port_new_notify(struct devlink *devlink,
+ unsigned int port_index,
+ struct genl_info *info)
+{
+ struct devlink_port *devlink_port;
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ lockdep_assert_held(&devlink->lock);
+ devlink_port = devlink_port_get_by_index(devlink, port_index);
+ if (!devlink_port) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW,
+ info->snd_portid, info->snd_seq, 0, NULL);
+ if (err)
+ goto out;
+
+ return genlmsg_reply(msg, info);
+
+out:
+ nlmsg_free(msg);
+ return err;
+}
+
+static int devlink_nl_cmd_port_new_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink_port_new_attrs new_attrs = {};
+ struct devlink *devlink = info->user_ptr[0];
+ unsigned int new_port_index;
+ int err;
+
+ if (!devlink->ops->port_new || !devlink->ops->port_del)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] ||
+ !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) {
+ NL_SET_ERR_MSG_MOD(extack, "Port flavour or PCI PF are not specified");
return -EINVAL;
+ }
+ new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]);
+ new_attrs.pfnum =
+ nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]);
+
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ /* Port index of the new port being created by driver. */
+ new_attrs.port_index =
+ nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+ new_attrs.port_index_valid = true;
+ }
+ if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) {
+ new_attrs.controller =
+ nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]);
+ new_attrs.controller_valid = true;
+ }
+ if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF &&
+ info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) {
+ new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]);
+ new_attrs.sfnum_valid = true;
+ }
+
+ err = devlink->ops->port_new(devlink, &new_attrs, extack,
+ &new_port_index);
+ if (err)
+ return err;
+
+ err = devlink_port_new_notify(devlink, new_port_index, info);
+ if (err && err != -ENODEV) {
+ /* Fail to send the response; destroy newly created port. */
+ devlink->ops->port_del(devlink, new_port_index, extack);
+ }
+ return err;
+}
+static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ unsigned int port_index;
+
+ if (!devlink->ops->port_del)
+ return -EOPNOTSUPP;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_INDEX)) {
+ NL_SET_ERR_MSG_MOD(extack, "Port index is not specified");
+ return -EINVAL;
+ }
port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
- return devlink_port_unsplit(devlink, port_index, info->extack);
+
+ return devlink->ops->port_del(devlink, port_index, extack);
+}
+
+static int
+devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
+ struct genl_info *info,
+ struct nlattr *nla_parent)
+{
+ struct devlink *devlink = devlink_rate->devlink;
+ const char *parent_name = nla_data(nla_parent);
+ const struct devlink_ops *ops = devlink->ops;
+ size_t len = strlen(parent_name);
+ struct devlink_rate *parent;
+ int err = -EOPNOTSUPP;
+
+ parent = devlink_rate->parent;
+ if (parent && len) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Rate object already has parent.");
+ return -EBUSY;
+ } else if (parent && !len) {
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_parent_set(devlink_rate, NULL,
+ devlink_rate->priv, NULL,
+ info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_parent_set(devlink_rate, NULL,
+ devlink_rate->priv, NULL,
+ info->extack);
+ if (err)
+ return err;
+
+ refcount_dec(&parent->refcnt);
+ devlink_rate->parent = NULL;
+ } else if (!parent && len) {
+ parent = devlink_rate_node_get_by_name(devlink, parent_name);
+ if (IS_ERR(parent))
+ return -ENODEV;
+
+ if (parent == devlink_rate) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Parent to self is not allowed");
+ return -EINVAL;
+ }
+
+ if (devlink_rate_is_node(devlink_rate) &&
+ devlink_rate_is_parent_node(devlink_rate, parent->parent)) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Node is already a parent of parent node.");
+ return -EEXIST;
+ }
+
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_parent_set(devlink_rate, parent,
+ devlink_rate->priv, parent->priv,
+ info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_parent_set(devlink_rate, parent,
+ devlink_rate->priv, parent->priv,
+ info->extack);
+ if (err)
+ return err;
+
+ refcount_inc(&parent->refcnt);
+ devlink_rate->parent = parent;
+ }
+
+ return 0;
+}
+
+static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
+ const struct devlink_ops *ops,
+ struct genl_info *info)
+{
+ struct nlattr *nla_parent, **attrs = info->attrs;
+ int err = -EOPNOTSUPP;
+ u64 rate;
+
+ if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) {
+ rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ if (err)
+ return err;
+ devlink_rate->tx_share = rate;
+ }
+
+ if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) {
+ rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ if (err)
+ return err;
+ devlink_rate->tx_max = rate;
+ }
+
+ nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME];
+ if (nla_parent) {
+ err = devlink_nl_rate_parent_node_set(devlink_rate, info,
+ nla_parent);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
+ struct genl_info *info,
+ enum devlink_rate_type type)
+{
+ struct nlattr **attrs = info->attrs;
+
+ if (type == DEVLINK_RATE_TYPE_LEAF) {
+ if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) {
+ NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) {
+ NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] &&
+ !ops->rate_leaf_parent_set) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the leafs");
+ return false;
+ }
+ } else if (type == DEVLINK_RATE_TYPE_NODE) {
+ if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) {
+ NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) {
+ NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] &&
+ !ops->rate_node_parent_set) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the nodes");
+ return false;
+ }
+ } else {
+ WARN(1, "Unknown type of rate object");
+ return false;
+ }
+
+ return true;
+}
+
+static int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_rate *devlink_rate = info->user_ptr[1];
+ struct devlink *devlink = devlink_rate->devlink;
+ const struct devlink_ops *ops = devlink->ops;
+ int err;
+
+ if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type))
+ return -EOPNOTSUPP;
+
+ err = devlink_nl_rate_set(devlink_rate, ops, info);
+
+ if (!err)
+ devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW);
+ return err;
+}
+
+static int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_rate *rate_node;
+ const struct devlink_ops *ops;
+ int err;
+
+ ops = devlink->ops;
+ if (!ops || !ops->rate_node_new || !ops->rate_node_del) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Rate nodes aren't supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE))
+ return -EOPNOTSUPP;
+
+ rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs);
+ if (!IS_ERR(rate_node))
+ return -EEXIST;
+ else if (rate_node == ERR_PTR(-EINVAL))
+ return -EINVAL;
+
+ rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL);
+ if (!rate_node)
+ return -ENOMEM;
+
+ rate_node->devlink = devlink;
+ rate_node->type = DEVLINK_RATE_TYPE_NODE;
+ rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL);
+ if (!rate_node->name) {
+ err = -ENOMEM;
+ goto err_strdup;
+ }
+
+ err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack);
+ if (err)
+ goto err_node_new;
+
+ err = devlink_nl_rate_set(rate_node, ops, info);
+ if (err)
+ goto err_rate_set;
+
+ refcount_set(&rate_node->refcnt, 1);
+ list_add(&rate_node->list, &devlink->rate_list);
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
+ return 0;
+
+err_rate_set:
+ ops->rate_node_del(rate_node, rate_node->priv, info->extack);
+err_node_new:
+ kfree(rate_node->name);
+err_strdup:
+ kfree(rate_node);
+ return err;
+}
+
+static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_rate *rate_node = info->user_ptr[1];
+ struct devlink *devlink = rate_node->devlink;
+ const struct devlink_ops *ops = devlink->ops;
+ int err;
+
+ if (refcount_read(&rate_node->refcnt) > 1) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Node has children. Cannot delete node.");
+ return -EBUSY;
+ }
+
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL);
+ err = ops->rate_node_del(rate_node, rate_node->priv, info->extack);
+ if (rate_node->parent)
+ refcount_dec(&rate_node->parent->refcnt);
+ list_del(&rate_node->list);
+ kfree(rate_node->name);
+ kfree(rate_node);
+ return err;
+}
+
+struct devlink_linecard_type {
+ const char *type;
+ const void *priv;
+};
+
+static int devlink_nl_linecard_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_linecard *linecard,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags,
+ struct netlink_ext_ack *extack)
+{
+ struct devlink_linecard_type *linecard_type;
+ struct nlattr *attr;
+ void *hdr;
+ int i;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_LINECARD_STATE, linecard->state))
+ goto nla_put_failure;
+ if (linecard->type &&
+ nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard->type))
+ goto nla_put_failure;
+
+ if (linecard->types_count) {
+ attr = nla_nest_start(msg,
+ DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES);
+ if (!attr)
+ goto nla_put_failure;
+ for (i = 0; i < linecard->types_count; i++) {
+ linecard_type = &linecard->types[i];
+ if (nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE,
+ linecard_type->type)) {
+ nla_nest_cancel(msg, attr);
+ goto nla_put_failure;
+ }
+ }
+ nla_nest_end(msg, attr);
+ }
+
+ if (linecard->nested_devlink &&
+ devlink_nl_put_nested_handle(msg, linecard->nested_devlink))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_linecard_notify(struct devlink_linecard *linecard,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = linecard->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW &&
+ cmd != DEVLINK_CMD_LINECARD_DEL);
+
+ if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_linecard_fill(msg, devlink, linecard, cmd, 0, 0, 0,
+ NULL);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_linecard *linecard = info->user_ptr[1];
+ struct devlink *devlink = linecard->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ mutex_lock(&linecard->state_lock);
+ err = devlink_nl_linecard_fill(msg, devlink, linecard,
+ DEVLINK_CMD_LINECARD_NEW,
+ info->snd_portid, info->snd_seq, 0,
+ info->extack);
+ mutex_unlock(&linecard->state_lock);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink_linecard *linecard;
+ struct devlink *devlink;
+ int start = cb->args[0];
+ unsigned long index;
+ int idx = 0;
+ int err;
+
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ mutex_lock(&devlink->linecards_lock);
+ list_for_each_entry(linecard, &devlink->linecard_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ mutex_lock(&linecard->state_lock);
+ err = devlink_nl_linecard_fill(msg, devlink, linecard,
+ DEVLINK_CMD_LINECARD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI,
+ cb->extack);
+ mutex_unlock(&linecard->state_lock);
+ if (err) {
+ mutex_unlock(&devlink->linecards_lock);
+ devlink_put(devlink);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&devlink->linecards_lock);
+ devlink_put(devlink);
+ }
+out:
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static struct devlink_linecard_type *
+devlink_linecard_type_lookup(struct devlink_linecard *linecard,
+ const char *type)
+{
+ struct devlink_linecard_type *linecard_type;
+ int i;
+
+ for (i = 0; i < linecard->types_count; i++) {
+ linecard_type = &linecard->types[i];
+ if (!strcmp(type, linecard_type->type))
+ return linecard_type;
+ }
+ return NULL;
+}
+
+static int devlink_linecard_type_set(struct devlink_linecard *linecard,
+ const char *type,
+ struct netlink_ext_ack *extack)
+{
+ const struct devlink_linecard_ops *ops = linecard->ops;
+ struct devlink_linecard_type *linecard_type;
+ int err;
+
+ mutex_lock(&linecard->state_lock);
+ if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) {
+ NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned");
+ err = -EBUSY;
+ goto out;
+ }
+ if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) {
+ NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned");
+ err = -EBUSY;
+ goto out;
+ }
+
+ linecard_type = devlink_linecard_type_lookup(linecard, type);
+ if (!linecard_type) {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported line card type provided");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED &&
+ linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) {
+ NL_SET_ERR_MSG_MOD(extack, "Line card already provisioned");
+ err = -EBUSY;
+ /* Check if the line card is provisioned in the same
+ * way the user asks. In case it is, make the operation
+ * to return success.
+ */
+ if (ops->same_provision &&
+ ops->same_provision(linecard, linecard->priv,
+ linecard_type->type,
+ linecard_type->priv))
+ err = 0;
+ goto out;
+ }
+
+ linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING;
+ linecard->type = linecard_type->type;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+ err = ops->provision(linecard, linecard->priv, linecard_type->type,
+ linecard_type->priv, extack);
+ if (err) {
+ /* Provisioning failed. Assume the linecard is unprovisioned
+ * for future operations.
+ */
+ mutex_lock(&linecard->state_lock);
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ linecard->type = NULL;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+ }
+ return err;
+
+out:
+ mutex_unlock(&linecard->state_lock);
+ return err;
+}
+
+static int devlink_linecard_type_unset(struct devlink_linecard *linecard,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ mutex_lock(&linecard->state_lock);
+ if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) {
+ NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned");
+ err = -EBUSY;
+ goto out;
+ }
+ if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) {
+ NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned");
+ err = -EBUSY;
+ goto out;
+ }
+ if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) {
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ linecard->type = NULL;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ err = 0;
+ goto out;
+ }
+
+ if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) {
+ NL_SET_ERR_MSG_MOD(extack, "Line card is not provisioned");
+ err = 0;
+ goto out;
+ }
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+ err = linecard->ops->unprovision(linecard, linecard->priv,
+ extack);
+ if (err) {
+ /* Unprovisioning failed. Assume the linecard is unprovisioned
+ * for future operations.
+ */
+ mutex_lock(&linecard->state_lock);
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ linecard->type = NULL;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+ }
+ return err;
+
+out:
+ mutex_unlock(&linecard->state_lock);
+ return err;
+}
+
+static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_linecard *linecard = info->user_ptr[1];
+ struct netlink_ext_ack *extack = info->extack;
+ int err;
+
+ if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) {
+ const char *type;
+
+ type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]);
+ if (strcmp(type, "")) {
+ err = devlink_linecard_type_set(linecard, type, extack);
+ if (err)
+ return err;
+ } else {
+ err = devlink_linecard_type_unset(linecard, extack);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
}
static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
@@ -886,10 +2455,14 @@ static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct devlink_sb *devlink_sb;
struct sk_buff *msg;
int err;
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
@@ -911,14 +2484,12 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
struct devlink *devlink;
struct devlink_sb *devlink_sb;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err;
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
- mutex_lock(&devlink->lock);
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ devl_lock(devlink);
list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
if (idx < start) {
idx++;
@@ -930,16 +2501,16 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
cb->nlh->nlmsg_seq,
NLM_F_MULTI);
if (err) {
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
+ devlink_put(devlink);
goto out;
}
idx++;
}
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
+ devlink_put(devlink);
}
out:
- mutex_unlock(&devlink_mutex);
-
cb->args[0] = idx;
return msg->len;
}
@@ -991,11 +2562,15 @@ static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct devlink_sb *devlink_sb;
struct sk_buff *msg;
u16 pool_index;
int err;
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
&pool_index);
if (err)
@@ -1051,30 +2626,33 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
struct devlink *devlink;
struct devlink_sb *devlink_sb;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err = 0;
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
- !devlink->ops->sb_pool_get)
- continue;
- mutex_lock(&devlink->lock);
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ if (!devlink->ops->sb_pool_get)
+ goto retry;
+
+ devl_lock(devlink);
list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
err = __sb_pool_get_dumpit(msg, start, &idx, devlink,
devlink_sb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq);
- if (err && err != -EOPNOTSUPP) {
- mutex_unlock(&devlink->lock);
+ if (err == -EOPNOTSUPP) {
+ err = 0;
+ } else if (err) {
+ devl_unlock(devlink);
+ devlink_put(devlink);
goto out;
}
}
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
+retry:
+ devlink_put(devlink);
}
out:
- mutex_unlock(&devlink_mutex);
-
if (err != -EMSGSIZE)
return err;
@@ -1100,12 +2678,16 @@ static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
enum devlink_sb_threshold_type threshold_type;
+ struct devlink_sb *devlink_sb;
u16 pool_index;
u32 size;
int err;
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
&pool_index);
if (err)
@@ -1115,7 +2697,7 @@ static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb,
if (err)
return err;
- if (!info->attrs[DEVLINK_ATTR_SB_POOL_SIZE])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_POOL_SIZE))
return -EINVAL;
size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]);
@@ -1164,7 +2746,7 @@ static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index,
pool_index, &cur, &max);
if (err && err != -EOPNOTSUPP)
- return err;
+ goto sb_occ_get_failure;
if (!err) {
if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
goto nla_put_failure;
@@ -1177,20 +2759,26 @@ static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
return 0;
nla_put_failure:
+ err = -EMSGSIZE;
+sb_occ_get_failure:
genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
+ return err;
}
static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink_port *devlink_port = info->user_ptr[1];
struct devlink *devlink = devlink_port->devlink;
- struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct devlink_sb *devlink_sb;
struct sk_buff *msg;
u16 pool_index;
int err;
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
&pool_index);
if (err)
@@ -1252,30 +2840,33 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
struct devlink *devlink;
struct devlink_sb *devlink_sb;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err = 0;
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
- !devlink->ops->sb_port_pool_get)
- continue;
- mutex_lock(&devlink->lock);
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ if (!devlink->ops->sb_port_pool_get)
+ goto retry;
+
+ devl_lock(devlink);
list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
err = __sb_port_pool_get_dumpit(msg, start, &idx,
devlink, devlink_sb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq);
- if (err && err != -EOPNOTSUPP) {
- mutex_unlock(&devlink->lock);
+ if (err == -EOPNOTSUPP) {
+ err = 0;
+ } else if (err) {
+ devl_unlock(devlink);
+ devlink_put(devlink);
goto out;
}
}
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
+retry:
+ devlink_put(devlink);
}
out:
- mutex_unlock(&devlink_mutex);
-
if (err != -EMSGSIZE)
return err;
@@ -1300,18 +2891,23 @@ static int devlink_sb_port_pool_set(struct devlink_port *devlink_port,
static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink_port *devlink_port = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb;
u16 pool_index;
u32 threshold;
int err;
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
&pool_index);
if (err)
return err;
- if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD))
return -EINVAL;
threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
@@ -1387,14 +2983,18 @@ nla_put_failure:
static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink_port *devlink_port = info->user_ptr[0];
+ struct devlink_port *devlink_port = info->user_ptr[1];
struct devlink *devlink = devlink_port->devlink;
- struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct devlink_sb *devlink_sb;
struct sk_buff *msg;
enum devlink_sb_pool_type pool_type;
u16 tc_index;
int err;
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
err = devlink_sb_pool_type_get_from_info(info, &pool_type);
if (err)
return err;
@@ -1482,32 +3082,34 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
struct devlink *devlink;
struct devlink_sb *devlink_sb;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err = 0;
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
- !devlink->ops->sb_tc_pool_bind_get)
- continue;
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ if (!devlink->ops->sb_tc_pool_bind_get)
+ goto retry;
- mutex_lock(&devlink->lock);
+ devl_lock(devlink);
list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx,
devlink,
devlink_sb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq);
- if (err && err != -EOPNOTSUPP) {
- mutex_unlock(&devlink->lock);
+ if (err == -EOPNOTSUPP) {
+ err = 0;
+ } else if (err) {
+ devl_unlock(devlink);
+ devlink_put(devlink);
goto out;
}
}
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
+retry:
+ devlink_put(devlink);
}
out:
- mutex_unlock(&devlink_mutex);
-
if (err != -EMSGSIZE)
return err;
@@ -1534,14 +3136,19 @@ static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port,
static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink_port *devlink_port = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct devlink *devlink = info->user_ptr[0];
enum devlink_sb_pool_type pool_type;
+ struct devlink_sb *devlink_sb;
u16 tc_index;
u16 pool_index;
u32 threshold;
int err;
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
err = devlink_sb_pool_type_get_from_info(info, &pool_type);
if (err)
return err;
@@ -1556,7 +3163,7 @@ static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb,
if (err)
return err;
- if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD))
return -EINVAL;
threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
@@ -1569,8 +3176,12 @@ static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
const struct devlink_ops *ops = devlink->ops;
+ struct devlink_sb *devlink_sb;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
if (ops->sb_occ_snapshot)
return ops->sb_occ_snapshot(devlink, devlink_sb->index);
@@ -1581,8 +3192,12 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
const struct devlink_ops *ops = devlink->ops;
+ struct devlink_sb *devlink_sb;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
if (ops->sb_occ_max_clear)
return ops->sb_occ_max_clear(devlink, devlink_sb->index);
@@ -1666,6 +3281,19 @@ static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
return genlmsg_reply(msg, info);
}
+static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
+ struct netlink_ext_ack *extack)
+{
+ struct devlink_rate *devlink_rate;
+
+ list_for_each_entry(devlink_rate, &devlink->rate_list, list)
+ if (devlink_rate_is_node(devlink_rate)) {
+ NL_SET_ERR_MSG_MOD(extack, "Rate node(s) exists.");
+ return -EBUSY;
+ }
+ return 0;
+}
+
static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
@@ -1680,6 +3308,9 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
if (!ops->eswitch_mode_set)
return -EOPNOTSUPP;
mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ err = devlink_rate_nodes_check(devlink, mode, info->extack);
+ if (err)
+ return err;
err = ops->eswitch_mode_set(devlink, mode, info->extack);
if (err)
return err;
@@ -2182,7 +3813,7 @@ void devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry)
kfree(value[value_index].mask);
}
}
-EXPORT_SYMBOL(devlink_dpipe_entry_clear);
+EXPORT_SYMBOL_GPL(devlink_dpipe_entry_clear);
static int devlink_dpipe_entries_fill(struct genl_info *info,
enum devlink_command cmd, int flags,
@@ -2221,7 +3852,7 @@ static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb,
struct devlink_dpipe_table *table;
const char *table_name;
- if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME))
return -EINVAL;
table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
@@ -2405,8 +4036,9 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb,
const char *table_name;
bool counters_enable;
- if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME] ||
- !info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME) ||
+ GENL_REQ_ATTR_CHECK(info,
+ DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED))
return -EINVAL;
table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
@@ -2495,8 +4127,8 @@ static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
u64 size;
int err;
- if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] ||
- !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_ID) ||
+ GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_SIZE))
return -EINVAL;
resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
@@ -2709,7 +4341,7 @@ static struct net *devlink_netns_get(struct sk_buff *skb,
struct net *net;
if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) {
- NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified");
+ NL_SET_ERR_MSG_MOD(info->extack, "multiple netns identifying attributes specified");
return ERR_PTR(-EINVAL);
}
@@ -2727,7 +4359,7 @@ static struct net *devlink_netns_get(struct sk_buff *skb,
net = ERR_PTR(-EINVAL);
}
if (IS_ERR(net)) {
- NL_SET_ERR_MSG(info->extack, "Unknown network namespace");
+ NL_SET_ERR_MSG_MOD(info->extack, "Unknown network namespace");
return ERR_PTR(-EINVAL);
}
if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
@@ -2742,10 +4374,12 @@ static void devlink_param_notify(struct devlink *devlink,
struct devlink_param_item *param_item,
enum devlink_command cmd);
-static void devlink_reload_netns_change(struct devlink *devlink,
- struct net *dest_net)
+static void devlink_ns_change_notify(struct devlink *devlink,
+ struct net *dest_net, struct net *curr_net,
+ bool new)
{
struct devlink_param_item *param_item;
+ enum devlink_command cmd;
/* Userspace needs to be notified about devlink objects
* removed from original and entering new network namespace.
@@ -2753,22 +4387,23 @@ static void devlink_reload_netns_change(struct devlink *devlink,
* reload process so the notifications are generated separatelly.
*/
- list_for_each_entry(param_item, &devlink->param_list, list)
- devlink_param_notify(devlink, 0, param_item,
- DEVLINK_CMD_PARAM_DEL);
- devlink_notify(devlink, DEVLINK_CMD_DEL);
+ if (!dest_net || net_eq(dest_net, curr_net))
+ return;
- __devlink_net_set(devlink, dest_net);
+ if (new)
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
- devlink_notify(devlink, DEVLINK_CMD_NEW);
+ cmd = new ? DEVLINK_CMD_PARAM_NEW : DEVLINK_CMD_PARAM_DEL;
list_for_each_entry(param_item, &devlink->param_list, list)
- devlink_param_notify(devlink, 0, param_item,
- DEVLINK_CMD_PARAM_NEW);
+ devlink_param_notify(devlink, 0, param_item, cmd);
+
+ if (!new)
+ devlink_notify(devlink, DEVLINK_CMD_DEL);
}
-static bool devlink_reload_supported(struct devlink *devlink)
+static bool devlink_reload_supported(const struct devlink_ops *ops)
{
- return devlink->ops->reload_down && devlink->ops->reload_up;
+ return ops->reload_down && ops->reload_up;
}
static void devlink_reload_failed_set(struct devlink *devlink,
@@ -2786,33 +4421,134 @@ bool devlink_is_reload_failed(const struct devlink *devlink)
}
EXPORT_SYMBOL_GPL(devlink_is_reload_failed);
+static void
+__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats,
+ enum devlink_reload_limit limit, u32 actions_performed)
+{
+ unsigned long actions = actions_performed;
+ int stat_idx;
+ int action;
+
+ for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) {
+ stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action;
+ reload_stats[stat_idx]++;
+ }
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+static void
+devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit,
+ u32 actions_performed)
+{
+ __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit,
+ actions_performed);
+}
+
+/**
+ * devlink_remote_reload_actions_performed - Update devlink on reload actions
+ * performed which are not a direct result of devlink reload call.
+ *
+ * This should be called by a driver after performing reload actions in case it was not
+ * a result of devlink reload call. For example fw_activate was performed as a result
+ * of devlink reload triggered fw_activate on another host.
+ * The motivation for this function is to keep data on reload actions performed on this
+ * function whether it was done due to direct devlink reload call or not.
+ *
+ * @devlink: devlink
+ * @limit: reload limit
+ * @actions_performed: bitmask of actions performed
+ */
+void devlink_remote_reload_actions_performed(struct devlink *devlink,
+ enum devlink_reload_limit limit,
+ u32 actions_performed)
+{
+ if (WARN_ON(!actions_performed ||
+ actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) ||
+ actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) ||
+ limit > DEVLINK_RELOAD_LIMIT_MAX))
+ return;
+
+ __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit,
+ actions_performed);
+}
+EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed);
+
static int devlink_reload(struct devlink *devlink, struct net *dest_net,
- struct netlink_ext_ack *extack)
+ enum devlink_reload_action action, enum devlink_reload_limit limit,
+ u32 *actions_performed, struct netlink_ext_ack *extack)
{
+ u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE];
+ struct net *curr_net;
int err;
- if (!devlink->reload_enabled)
- return -EOPNOTSUPP;
+ memcpy(remote_reload_stats, devlink->stats.remote_reload_stats,
+ sizeof(remote_reload_stats));
- err = devlink->ops->reload_down(devlink, !!dest_net, extack);
+ curr_net = devlink_net(devlink);
+ devlink_ns_change_notify(devlink, dest_net, curr_net, false);
+ err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack);
if (err)
return err;
- if (dest_net && !net_eq(dest_net, devlink_net(devlink)))
- devlink_reload_netns_change(devlink, dest_net);
+ if (dest_net && !net_eq(dest_net, curr_net))
+ write_pnet(&devlink->_net, dest_net);
- err = devlink->ops->reload_up(devlink, extack);
+ err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack);
devlink_reload_failed_set(devlink, !!err);
- return err;
+ if (err)
+ return err;
+
+ devlink_ns_change_notify(devlink, dest_net, curr_net, true);
+ WARN_ON(!(*actions_performed & BIT(action)));
+ /* Catch driver on updating the remote action within devlink reload */
+ WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats,
+ sizeof(remote_reload_stats)));
+ devlink_reload_stats_update(devlink, limit, *actions_performed);
+ return 0;
+}
+
+static int
+devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed,
+ enum devlink_command cmd, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd);
+ if (!hdr)
+ goto free_msg;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed,
+ actions_performed))
+ goto nla_put_failure;
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_reply(msg, info);
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
}
static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
+ enum devlink_reload_action action;
+ enum devlink_reload_limit limit;
struct net *dest_net = NULL;
+ u32 actions_performed;
int err;
- if (!devlink_reload_supported(devlink) || !devlink->reload_enabled)
+ if (!(devlink->features & DEVLINK_F_RELOAD))
return -EOPNOTSUPP;
err = devlink_resources_validate(devlink, NULL, info);
@@ -2821,6 +4557,48 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
return err;
}
+ if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION])
+ action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]);
+ else
+ action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT;
+
+ if (!devlink_reload_action_is_supported(devlink, action)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Requested reload action is not supported by the driver");
+ return -EOPNOTSUPP;
+ }
+
+ limit = DEVLINK_RELOAD_LIMIT_UNSPEC;
+ if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) {
+ struct nla_bitfield32 limits;
+ u32 limits_selected;
+
+ limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]);
+ limits_selected = limits.value & limits.selector;
+ if (!limits_selected) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Invalid limit selected");
+ return -EINVAL;
+ }
+ for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++)
+ if (limits_selected & BIT(limit))
+ break;
+ /* UAPI enables multiselection, but currently it is not used */
+ if (limits_selected != BIT(limit)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Multiselection of limit is not supported");
+ return -EOPNOTSUPP;
+ }
+ if (!devlink_reload_limit_is_supported(devlink, limit)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Requested limit is not supported by the driver");
+ return -EOPNOTSUPP;
+ }
+ if (devlink_reload_combination_is_invalid(action, limit)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Requested limit is invalid for this action");
+ return -EINVAL;
+ }
+ }
if (info->attrs[DEVLINK_ATTR_NETNS_PID] ||
info->attrs[DEVLINK_ATTR_NETNS_FD] ||
info->attrs[DEVLINK_ATTR_NETNS_ID]) {
@@ -2829,20 +4607,25 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
return PTR_ERR(dest_net);
}
- err = devlink_reload(devlink, dest_net, info->extack);
+ err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack);
if (dest_net)
put_net(dest_net);
- return err;
+ if (err)
+ return err;
+ /* For backward compatibility generate reply only if attributes used by user */
+ if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS])
+ return 0;
+
+ return devlink_nl_reload_actions_performed_snd(devlink, actions_performed,
+ DEVLINK_CMD_RELOAD, info);
}
static int devlink_nl_flash_update_fill(struct sk_buff *msg,
struct devlink *devlink,
enum devlink_command cmd,
- const char *status_msg,
- const char *component,
- unsigned long done, unsigned long total)
+ struct devlink_flash_notify *params)
{
void *hdr;
@@ -2856,19 +4639,22 @@ static int devlink_nl_flash_update_fill(struct sk_buff *msg,
if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS)
goto out;
- if (status_msg &&
+ if (params->status_msg &&
nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,
- status_msg))
+ params->status_msg))
goto nla_put_failure;
- if (component &&
+ if (params->component &&
nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,
- component))
+ params->component))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,
- done, DEVLINK_ATTR_PAD))
+ params->done, DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,
- total, DEVLINK_ATTR_PAD))
+ params->total, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT,
+ params->timeout, DEVLINK_ATTR_PAD))
goto nla_put_failure;
out:
@@ -2882,10 +4668,7 @@ nla_put_failure:
static void __devlink_flash_update_notify(struct devlink *devlink,
enum devlink_command cmd,
- const char *status_msg,
- const char *component,
- unsigned long done,
- unsigned long total)
+ struct devlink_flash_notify *params)
{
struct sk_buff *msg;
int err;
@@ -2894,12 +4677,14 @@ static void __devlink_flash_update_notify(struct devlink *devlink,
cmd != DEVLINK_CMD_FLASH_UPDATE_END &&
cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS);
+ if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
+ return;
+
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
- err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg,
- component, done, total);
+ err = devlink_nl_flash_update_fill(msg, devlink, cmd, params);
if (err)
goto out_free_msg;
@@ -2911,21 +4696,23 @@ out_free_msg:
nlmsg_free(msg);
}
-void devlink_flash_update_begin_notify(struct devlink *devlink)
+static void devlink_flash_update_begin_notify(struct devlink *devlink)
{
+ struct devlink_flash_notify params = {};
+
__devlink_flash_update_notify(devlink,
DEVLINK_CMD_FLASH_UPDATE,
- NULL, NULL, 0, 0);
+ &params);
}
-EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify);
-void devlink_flash_update_end_notify(struct devlink *devlink)
+static void devlink_flash_update_end_notify(struct devlink *devlink)
{
+ struct devlink_flash_notify params = {};
+
__devlink_flash_update_notify(devlink,
DEVLINK_CMD_FLASH_UPDATE_END,
- NULL, NULL, 0, 0);
+ &params);
}
-EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify);
void devlink_flash_update_status_notify(struct devlink *devlink,
const char *status_msg,
@@ -2933,31 +4720,350 @@ void devlink_flash_update_status_notify(struct devlink *devlink,
unsigned long done,
unsigned long total)
{
+ struct devlink_flash_notify params = {
+ .status_msg = status_msg,
+ .component = component,
+ .done = done,
+ .total = total,
+ };
+
__devlink_flash_update_notify(devlink,
DEVLINK_CMD_FLASH_UPDATE_STATUS,
- status_msg, component, done, total);
+ &params);
}
EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify);
+void devlink_flash_update_timeout_notify(struct devlink *devlink,
+ const char *status_msg,
+ const char *component,
+ unsigned long timeout)
+{
+ struct devlink_flash_notify params = {
+ .status_msg = status_msg,
+ .component = component,
+ .timeout = timeout,
+ };
+
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_STATUS,
+ &params);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify);
+
+struct devlink_info_req {
+ struct sk_buff *msg;
+ void (*version_cb)(const char *version_name,
+ enum devlink_info_version_type version_type,
+ void *version_cb_priv);
+ void *version_cb_priv;
+};
+
+struct devlink_flash_component_lookup_ctx {
+ const char *lookup_name;
+ bool lookup_name_found;
+};
+
+static void
+devlink_flash_component_lookup_cb(const char *version_name,
+ enum devlink_info_version_type version_type,
+ void *version_cb_priv)
+{
+ struct devlink_flash_component_lookup_ctx *lookup_ctx = version_cb_priv;
+
+ if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT ||
+ lookup_ctx->lookup_name_found)
+ return;
+
+ lookup_ctx->lookup_name_found =
+ !strcmp(lookup_ctx->lookup_name, version_name);
+}
+
+static int devlink_flash_component_get(struct devlink *devlink,
+ struct nlattr *nla_component,
+ const char **p_component,
+ struct netlink_ext_ack *extack)
+{
+ struct devlink_flash_component_lookup_ctx lookup_ctx = {};
+ struct devlink_info_req req = {};
+ const char *component;
+ int ret;
+
+ if (!nla_component)
+ return 0;
+
+ component = nla_data(nla_component);
+
+ if (!devlink->ops->info_get) {
+ NL_SET_ERR_MSG_ATTR(extack, nla_component,
+ "component update is not supported by this device");
+ return -EOPNOTSUPP;
+ }
+
+ lookup_ctx.lookup_name = component;
+ req.version_cb = devlink_flash_component_lookup_cb;
+ req.version_cb_priv = &lookup_ctx;
+
+ ret = devlink->ops->info_get(devlink, &req, NULL);
+ if (ret)
+ return ret;
+
+ if (!lookup_ctx.lookup_name_found) {
+ NL_SET_ERR_MSG_ATTR(extack, nla_component,
+ "selected component is not supported by this device");
+ return -EINVAL;
+ }
+ *p_component = component;
+ return 0;
+}
+
static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
struct genl_info *info)
{
+ struct nlattr *nla_overwrite_mask, *nla_file_name;
+ struct devlink_flash_update_params params = {};
struct devlink *devlink = info->user_ptr[0];
- const char *file_name, *component;
- struct nlattr *nla_component;
+ const char *file_name;
+ u32 supported_params;
+ int ret;
if (!devlink->ops->flash_update)
return -EOPNOTSUPP;
- if (!info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME))
return -EINVAL;
- file_name = nla_data(info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]);
- nla_component = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT];
- component = nla_component ? nla_data(nla_component) : NULL;
+ ret = devlink_flash_component_get(devlink,
+ info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT],
+ &params.component, info->extack);
+ if (ret)
+ return ret;
+
+ supported_params = devlink->ops->supported_flash_update_params;
+
+ nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK];
+ if (nla_overwrite_mask) {
+ struct nla_bitfield32 sections;
+
+ if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) {
+ NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask,
+ "overwrite settings are not supported by this device");
+ return -EOPNOTSUPP;
+ }
+ sections = nla_get_bitfield32(nla_overwrite_mask);
+ params.overwrite_mask = sections.value & sections.selector;
+ }
+
+ nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME];
+ file_name = nla_data(nla_file_name);
+ ret = request_firmware(&params.fw, file_name, devlink->dev);
+ if (ret) {
+ NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name, "failed to locate the requested firmware file");
+ return ret;
+ }
+
+ devlink_flash_update_begin_notify(devlink);
+ ret = devlink->ops->flash_update(devlink, &params, info->extack);
+ devlink_flash_update_end_notify(devlink);
+
+ release_firmware(params.fw);
+
+ return ret;
+}
+
+static int
+devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink,
+ u32 portid, u32 seq, int flags,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *selftests;
+ void *hdr;
+ int err;
+ int i;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags,
+ DEVLINK_CMD_SELFTESTS_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ err = -EMSGSIZE;
+ if (devlink_nl_put_handle(msg, devlink))
+ goto err_cancel_msg;
+
+ selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS);
+ if (!selftests)
+ goto err_cancel_msg;
+
+ for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1;
+ i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) {
+ if (devlink->ops->selftest_check(devlink, i, extack)) {
+ err = nla_put_flag(msg, i);
+ if (err)
+ goto err_cancel_msg;
+ }
+ }
+
+ nla_nest_end(msg, selftests);
+ genlmsg_end(msg, hdr);
+ return 0;
- return devlink->ops->flash_update(devlink, file_name, component,
- info->extack);
+err_cancel_msg:
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+static int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ if (!devlink->ops->selftest_check)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_selftests_fill(msg, devlink, info->snd_portid,
+ info->snd_seq, 0, info->extack);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_selftests_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ int start = cb->args[0];
+ unsigned long index;
+ int idx = 0;
+ int err = 0;
+
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ if (idx < start || !devlink->ops->selftest_check)
+ goto inc;
+
+ devl_lock(devlink);
+ err = devlink_nl_selftests_fill(msg, devlink,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->extack);
+ devl_unlock(devlink);
+ if (err) {
+ devlink_put(devlink);
+ break;
+ }
+inc:
+ idx++;
+ devlink_put(devlink);
+ }
+
+ if (err != -EMSGSIZE)
+ return err;
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id,
+ enum devlink_selftest_status test_status)
+{
+ struct nlattr *result_attr;
+
+ result_attr = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT);
+ if (!result_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) ||
+ nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS,
+ test_status))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, result_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, result_attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_selftests_run(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1];
+ struct devlink *devlink = info->user_ptr[0];
+ struct nlattr *attrs, *selftests;
+ struct sk_buff *msg;
+ void *hdr;
+ int err;
+ int i;
+
+ if (!devlink->ops->selftest_run || !devlink->ops->selftest_check)
+ return -EOPNOTSUPP;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS))
+ return -EINVAL;
+
+ attrs = info->attrs[DEVLINK_ATTR_SELFTESTS];
+
+ err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, attrs,
+ devlink_selftest_nl_policy, info->extack);
+ if (err < 0)
+ return err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = -EMSGSIZE;
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN);
+ if (!hdr)
+ goto free_msg;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto genlmsg_cancel;
+
+ selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS);
+ if (!selftests)
+ goto genlmsg_cancel;
+
+ for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1;
+ i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) {
+ enum devlink_selftest_status test_status;
+
+ if (nla_get_flag(tb[i])) {
+ if (!devlink->ops->selftest_check(devlink, i,
+ info->extack)) {
+ if (devlink_selftest_result_put(msg, i,
+ DEVLINK_SELFTEST_STATUS_SKIP))
+ goto selftests_nest_cancel;
+ continue;
+ }
+
+ test_status = devlink->ops->selftest_run(devlink, i,
+ info->extack);
+ if (devlink_selftest_result_put(msg, i, test_status))
+ goto selftests_nest_cancel;
+ }
+ }
+
+ nla_nest_end(msg, selftests);
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+selftests_nest_cancel:
+ nla_nest_cancel(msg, selftests);
+genlmsg_cancel:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return err;
}
static const struct devlink_param devlink_param_generic[] = {
@@ -3011,6 +5117,41 @@ static const struct devlink_param devlink_param_generic[] = {
.name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME,
.type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE,
},
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE,
+ .name = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_NAME,
+ .type = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE,
+ .name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME,
+ .type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE,
+ },
};
static int devlink_param_generic_verify(const struct devlink_param *param)
@@ -3074,7 +5215,7 @@ static int devlink_param_get(struct devlink *devlink,
const struct devlink_param *param,
struct devlink_param_gset_ctx *ctx)
{
- if (!param->get)
+ if (!param->get || devlink->reload_failed)
return -EOPNOTSUPP;
return param->get(devlink, param->id, ctx);
}
@@ -3083,7 +5224,7 @@ static int devlink_param_set(struct devlink *devlink,
const struct devlink_param *param,
struct devlink_param_gset_ctx *ctx)
{
- if (!param->set)
+ if (!param->set || devlink->reload_failed)
return -EOPNOTSUPP;
return param->set(devlink, param->id, ctx);
}
@@ -3183,8 +5324,6 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
return -EOPNOTSUPP;
param_value[i] = param_item->driverinit_value;
} else {
- if (!param_item->published)
- continue;
ctx.cmode = i;
err = devlink_param_get(devlink, param, &ctx);
if (err)
@@ -3260,6 +5399,7 @@ static void devlink_param_notify(struct devlink *devlink,
WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL &&
cmd != DEVLINK_CMD_PORT_PARAM_NEW &&
cmd != DEVLINK_CMD_PORT_PARAM_DEL);
+ ASSERT_DEVLINK_REGISTERED(devlink);
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
@@ -3281,14 +5421,12 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
struct devlink_param_item *param_item;
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err = 0;
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
- mutex_lock(&devlink->lock);
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ devl_lock(devlink);
list_for_each_entry(param_item, &devlink->param_list, list) {
if (idx < start) {
idx++;
@@ -3299,17 +5437,19 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI);
- if (err && err != -EOPNOTSUPP) {
- mutex_unlock(&devlink->lock);
+ if (err == -EOPNOTSUPP) {
+ err = 0;
+ } else if (err) {
+ devl_unlock(devlink);
+ devlink_put(devlink);
goto out;
}
idx++;
}
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
+ devlink_put(devlink);
}
out:
- mutex_unlock(&devlink_mutex);
-
if (err != -EMSGSIZE)
return err;
@@ -3321,7 +5461,7 @@ static int
devlink_param_type_get_from_info(struct genl_info *info,
enum devlink_param_type *param_type)
{
- if (!info->attrs[DEVLINK_ATTR_PARAM_TYPE])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE))
return -EINVAL;
switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) {
@@ -3398,7 +5538,7 @@ devlink_param_get_from_info(struct list_head *param_list,
{
char *param_name;
- if (!info->attrs[DEVLINK_ATTR_PARAM_NAME])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_NAME))
return NULL;
param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]);
@@ -3464,7 +5604,7 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
return err;
}
- if (!info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE))
return -EINVAL;
cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]);
if (!devlink_param_cmode_is_supported(param, cmode))
@@ -3499,133 +5639,25 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
info, DEVLINK_CMD_PARAM_NEW);
}
-static int devlink_param_register_one(struct devlink *devlink,
- unsigned int port_index,
- struct list_head *param_list,
- const struct devlink_param *param,
- enum devlink_command cmd)
-{
- struct devlink_param_item *param_item;
-
- if (devlink_param_find_by_name(param_list, param->name))
- return -EEXIST;
-
- if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT))
- WARN_ON(param->get || param->set);
- else
- WARN_ON(!param->get || !param->set);
-
- param_item = kzalloc(sizeof(*param_item), GFP_KERNEL);
- if (!param_item)
- return -ENOMEM;
- param_item->param = param;
-
- list_add_tail(&param_item->list, param_list);
- devlink_param_notify(devlink, port_index, param_item, cmd);
- return 0;
-}
-
-static void devlink_param_unregister_one(struct devlink *devlink,
- unsigned int port_index,
- struct list_head *param_list,
- const struct devlink_param *param,
- enum devlink_command cmd)
-{
- struct devlink_param_item *param_item;
-
- param_item = devlink_param_find_by_name(param_list, param->name);
- WARN_ON(!param_item);
- devlink_param_notify(devlink, port_index, param_item, cmd);
- list_del(&param_item->list);
- kfree(param_item);
-}
-
static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg,
struct netlink_callback *cb)
{
- struct devlink_param_item *param_item;
- struct devlink_port *devlink_port;
- struct devlink *devlink;
- int start = cb->args[0];
- int idx = 0;
- int err = 0;
-
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
- mutex_lock(&devlink->lock);
- list_for_each_entry(devlink_port, &devlink->port_list, list) {
- list_for_each_entry(param_item,
- &devlink_port->param_list, list) {
- if (idx < start) {
- idx++;
- continue;
- }
- err = devlink_nl_param_fill(msg,
- devlink_port->devlink,
- devlink_port->index, param_item,
- DEVLINK_CMD_PORT_PARAM_GET,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI);
- if (err && err != -EOPNOTSUPP) {
- mutex_unlock(&devlink->lock);
- goto out;
- }
- idx++;
- }
- }
- mutex_unlock(&devlink->lock);
- }
-out:
- mutex_unlock(&devlink_mutex);
-
- if (err != -EMSGSIZE)
- return err;
-
- cb->args[0] = idx;
+ NL_SET_ERR_MSG_MOD(cb->extack, "Port params are not supported");
return msg->len;
}
static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink_port *devlink_port = info->user_ptr[0];
- struct devlink_param_item *param_item;
- struct sk_buff *msg;
- int err;
-
- param_item = devlink_param_get_from_info(&devlink_port->param_list,
- info);
- if (!param_item)
- return -EINVAL;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = devlink_nl_param_fill(msg, devlink_port->devlink,
- devlink_port->index, param_item,
- DEVLINK_CMD_PORT_PARAM_GET,
- info->snd_portid, info->snd_seq, 0);
- if (err) {
- nlmsg_free(msg);
- return err;
- }
-
- return genlmsg_reply(msg, info);
+ NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported");
+ return -EINVAL;
}
static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink_port *devlink_port = info->user_ptr[0];
-
- return __devlink_nl_cmd_param_set_doit(devlink_port->devlink,
- devlink_port->index,
- &devlink_port->param_list, info,
- DEVLINK_CMD_PORT_PARAM_NEW);
+ NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported");
+ return -EINVAL;
}
static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
@@ -3694,7 +5726,14 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
if (err)
goto nla_put_failure;
- err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->name);
+ if (region->port) {
+ err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
+ region->port->index);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name);
if (err)
goto nla_put_failure;
@@ -3704,6 +5743,11 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
if (err)
goto nla_put_failure;
+ err = nla_put_u32(msg, DEVLINK_ATTR_REGION_MAX_SNAPSHOTS,
+ region->max_snapshots);
+ if (err)
+ goto nla_put_failure;
+
err = devlink_nl_region_snapshots_id_put(msg, devlink, region);
if (err)
goto nla_put_failure;
@@ -3716,31 +5760,40 @@ nla_put_failure:
return err;
}
-static void devlink_nl_region_notify(struct devlink_region *region,
- struct devlink_snapshot *snapshot,
- enum devlink_command cmd)
+static struct sk_buff *
+devlink_nl_region_notify_build(struct devlink_region *region,
+ struct devlink_snapshot *snapshot,
+ enum devlink_command cmd, u32 portid, u32 seq)
{
struct devlink *devlink = region->devlink;
struct sk_buff *msg;
void *hdr;
int err;
- WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
- return;
+ return ERR_PTR(-ENOMEM);
- hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
- if (!hdr)
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd);
+ if (!hdr) {
+ err = -EMSGSIZE;
goto out_free_msg;
+ }
err = devlink_nl_put_handle(msg, devlink);
if (err)
goto out_cancel_msg;
+ if (region->port) {
+ err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
+ region->port->index);
+ if (err)
+ goto out_cancel_msg;
+ }
+
err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
- region->name);
+ region->ops->name);
if (err)
goto out_cancel_msg;
@@ -3757,24 +5810,239 @@ static void devlink_nl_region_notify(struct devlink_region *region,
}
genlmsg_end(msg, hdr);
- genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
- msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
-
- return;
+ return msg;
out_cancel_msg:
genlmsg_cancel(msg, hdr);
out_free_msg:
nlmsg_free(msg);
+ return ERR_PTR(err);
+}
+
+static void devlink_nl_region_notify(struct devlink_region *region,
+ struct devlink_snapshot *snapshot,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = region->devlink;
+ struct sk_buff *msg;
+
+ WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
+ if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
+ return;
+
+ msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0);
+ if (IS_ERR(msg))
+ return;
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg,
+ 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+/**
+ * __devlink_snapshot_id_increment - Increment number of snapshots using an id
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Track when a new snapshot begins using an id. Load the count for the
+ * given id from the snapshot xarray, increment it, and store it back.
+ *
+ * Called when a new snapshot is created with the given id.
+ *
+ * The id *must* have been previously allocated by
+ * devlink_region_snapshot_id_get().
+ *
+ * Returns 0 on success, or an error on failure.
+ */
+static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id)
+{
+ unsigned long count;
+ void *p;
+ int err;
+
+ xa_lock(&devlink->snapshot_ids);
+ p = xa_load(&devlink->snapshot_ids, id);
+ if (WARN_ON(!p)) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ if (WARN_ON(!xa_is_value(p))) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ count = xa_to_value(p);
+ count++;
+
+ err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(count),
+ GFP_ATOMIC));
+unlock:
+ xa_unlock(&devlink->snapshot_ids);
+ return err;
+}
+
+/**
+ * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Track when a snapshot is deleted and stops using an id. Load the count
+ * for the given id from the snapshot xarray, decrement it, and store it
+ * back.
+ *
+ * If the count reaches zero, erase this id from the xarray, freeing it
+ * up for future re-use by devlink_region_snapshot_id_get().
+ *
+ * Called when a snapshot using the given id is deleted, and when the
+ * initial allocator of the id is finished using it.
+ */
+static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id)
+{
+ unsigned long count;
+ void *p;
+
+ xa_lock(&devlink->snapshot_ids);
+ p = xa_load(&devlink->snapshot_ids, id);
+ if (WARN_ON(!p))
+ goto unlock;
+
+ if (WARN_ON(!xa_is_value(p)))
+ goto unlock;
+
+ count = xa_to_value(p);
+
+ if (count > 1) {
+ count--;
+ __xa_store(&devlink->snapshot_ids, id, xa_mk_value(count),
+ GFP_ATOMIC);
+ } else {
+ /* If this was the last user, we can erase this id */
+ __xa_erase(&devlink->snapshot_ids, id);
+ }
+unlock:
+ xa_unlock(&devlink->snapshot_ids);
+}
+
+/**
+ * __devlink_snapshot_id_insert - Insert a specific snapshot ID
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Mark the given snapshot id as used by inserting a zero value into the
+ * snapshot xarray.
+ *
+ * This must be called while holding the devlink instance lock. Unlike
+ * devlink_snapshot_id_get, the initial reference count is zero, not one.
+ * It is expected that the id will immediately be used before
+ * releasing the devlink instance lock.
+ *
+ * Returns zero on success, or an error code if the snapshot id could not
+ * be inserted.
+ */
+static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id)
+{
+ int err;
+
+ xa_lock(&devlink->snapshot_ids);
+ if (xa_load(&devlink->snapshot_ids, id)) {
+ xa_unlock(&devlink->snapshot_ids);
+ return -EEXIST;
+ }
+ err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(0),
+ GFP_ATOMIC));
+ xa_unlock(&devlink->snapshot_ids);
+ return err;
+}
+
+/**
+ * __devlink_region_snapshot_id_get - get snapshot ID
+ * @devlink: devlink instance
+ * @id: storage to return snapshot id
+ *
+ * Allocates a new snapshot id. Returns zero on success, or a negative
+ * error on failure. Must be called while holding the devlink instance
+ * lock.
+ *
+ * Snapshot IDs are tracked using an xarray which stores the number of
+ * users of the snapshot id.
+ *
+ * Note that the caller of this function counts as a 'user', in order to
+ * avoid race conditions. The caller must release its hold on the
+ * snapshot by using devlink_region_snapshot_id_put.
+ */
+static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
+{
+ return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1),
+ xa_limit_32b, GFP_KERNEL);
+}
+
+/**
+ * __devlink_region_snapshot_create - create a new snapshot
+ * This will add a new snapshot of a region. The snapshot
+ * will be stored on the region struct and can be accessed
+ * from devlink. This is useful for future analyses of snapshots.
+ * Multiple snapshots can be created on a region.
+ * The @snapshot_id should be obtained using the getter function.
+ *
+ * Must be called only while holding the region snapshot lock.
+ *
+ * @region: devlink region of the snapshot
+ * @data: snapshot data
+ * @snapshot_id: snapshot id to be created
+ */
+static int
+__devlink_region_snapshot_create(struct devlink_region *region,
+ u8 *data, u32 snapshot_id)
+{
+ struct devlink *devlink = region->devlink;
+ struct devlink_snapshot *snapshot;
+ int err;
+
+ lockdep_assert_held(&region->snapshot_lock);
+
+ /* check if region can hold one more snapshot */
+ if (region->cur_snapshots == region->max_snapshots)
+ return -ENOSPC;
+
+ if (devlink_region_snapshot_get_by_id(region, snapshot_id))
+ return -EEXIST;
+
+ snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
+ if (!snapshot)
+ return -ENOMEM;
+
+ err = __devlink_snapshot_id_increment(devlink, snapshot_id);
+ if (err)
+ goto err_snapshot_id_increment;
+
+ snapshot->id = snapshot_id;
+ snapshot->region = region;
+ snapshot->data = data;
+
+ list_add_tail(&snapshot->list, &region->snapshot_list);
+
+ region->cur_snapshots++;
+
+ devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
+ return 0;
+
+err_snapshot_id_increment:
+ kfree(snapshot);
+ return err;
}
static void devlink_region_snapshot_del(struct devlink_region *region,
struct devlink_snapshot *snapshot)
{
+ struct devlink *devlink = region->devlink;
+
+ lockdep_assert_held(&region->snapshot_lock);
+
devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
region->cur_snapshots--;
list_del(&snapshot->list);
- (*snapshot->data_destructor)(snapshot->data);
+ region->ops->destructor(snapshot->data);
+ __devlink_snapshot_id_decrement(devlink, snapshot->id);
kfree(snapshot);
}
@@ -3782,16 +6050,30 @@ static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
+ struct devlink_port *port = NULL;
struct devlink_region *region;
const char *region_name;
struct sk_buff *msg;
+ unsigned int index;
int err;
- if (!info->attrs[DEVLINK_ATTR_REGION_NAME])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME))
return -EINVAL;
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+ port = devlink_port_get_by_index(devlink, index);
+ if (!port)
+ return -ENODEV;
+ }
+
region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
- region = devlink_region_get_by_name(devlink, region_name);
+ if (port)
+ region = devlink_port_region_get_by_name(port, region_name);
+ else
+ region = devlink_region_get_by_name(devlink, region_name);
+
if (!region)
return -EINVAL;
@@ -3810,41 +6092,89 @@ static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
return genlmsg_reply(msg, info);
}
+static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb,
+ struct devlink_port *port,
+ int *idx,
+ int start)
+{
+ struct devlink_region *region;
+ int err = 0;
+
+ list_for_each_entry(region, &port->region_list, list) {
+ if (*idx < start) {
+ (*idx)++;
+ continue;
+ }
+ err = devlink_nl_region_fill(msg, port->devlink,
+ DEVLINK_CMD_REGION_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, region);
+ if (err)
+ goto out;
+ (*idx)++;
+ }
+
+out:
+ return err;
+}
+
+static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb,
+ struct devlink *devlink,
+ int *idx,
+ int start)
+{
+ struct devlink_region *region;
+ struct devlink_port *port;
+ int err = 0;
+
+ devl_lock(devlink);
+ list_for_each_entry(region, &devlink->region_list, list) {
+ if (*idx < start) {
+ (*idx)++;
+ continue;
+ }
+ err = devlink_nl_region_fill(msg, devlink,
+ DEVLINK_CMD_REGION_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, region);
+ if (err)
+ goto out;
+ (*idx)++;
+ }
+
+ list_for_each_entry(port, &devlink->port_list, list) {
+ err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, idx,
+ start);
+ if (err)
+ goto out;
+ }
+
+out:
+ devl_unlock(devlink);
+ return err;
+}
+
static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg,
struct netlink_callback *cb)
{
- struct devlink_region *region;
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
- int err;
-
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
+ int err = 0;
- mutex_lock(&devlink->lock);
- list_for_each_entry(region, &devlink->region_list, list) {
- if (idx < start) {
- idx++;
- continue;
- }
- err = devlink_nl_region_fill(msg, devlink,
- DEVLINK_CMD_REGION_GET,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI, region);
- if (err) {
- mutex_unlock(&devlink->lock);
- goto out;
- }
- idx++;
- }
- mutex_unlock(&devlink->lock);
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ err = devlink_nl_cmd_region_get_devlink_dumpit(msg, cb, devlink,
+ &idx, start);
+ devlink_put(devlink);
+ if (err)
+ goto out;
}
out:
- mutex_unlock(&devlink_mutex);
cb->args[0] = idx;
return msg->len;
}
@@ -3854,29 +6184,173 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb,
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_snapshot *snapshot;
+ struct devlink_port *port = NULL;
struct devlink_region *region;
const char *region_name;
+ unsigned int index;
u32 snapshot_id;
- if (!info->attrs[DEVLINK_ATTR_REGION_NAME] ||
- !info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME) ||
+ GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_SNAPSHOT_ID))
return -EINVAL;
region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
- region = devlink_region_get_by_name(devlink, region_name);
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+ port = devlink_port_get_by_index(devlink, index);
+ if (!port)
+ return -ENODEV;
+ }
+
+ if (port)
+ region = devlink_port_region_get_by_name(port, region_name);
+ else
+ region = devlink_region_get_by_name(devlink, region_name);
+
if (!region)
return -EINVAL;
+ mutex_lock(&region->snapshot_lock);
snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
- if (!snapshot)
+ if (!snapshot) {
+ mutex_unlock(&region->snapshot_lock);
return -EINVAL;
+ }
devlink_region_snapshot_del(region, snapshot);
+ mutex_unlock(&region->snapshot_lock);
return 0;
}
+static int
+devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_snapshot *snapshot;
+ struct devlink_port *port = NULL;
+ struct nlattr *snapshot_id_attr;
+ struct devlink_region *region;
+ const char *region_name;
+ unsigned int index;
+ u32 snapshot_id;
+ u8 *data;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) {
+ NL_SET_ERR_MSG_MOD(info->extack, "No region name provided");
+ return -EINVAL;
+ }
+
+ region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+ port = devlink_port_get_by_index(devlink, index);
+ if (!port)
+ return -ENODEV;
+ }
+
+ if (port)
+ region = devlink_port_region_get_by_name(port, region_name);
+ else
+ region = devlink_region_get_by_name(devlink, region_name);
+
+ if (!region) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not exist");
+ return -EINVAL;
+ }
+
+ if (!region->ops->snapshot) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not support taking an immediate snapshot");
+ return -EOPNOTSUPP;
+ }
+
+ mutex_lock(&region->snapshot_lock);
+
+ if (region->cur_snapshots == region->max_snapshots) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The region has reached the maximum number of stored snapshots");
+ err = -ENOSPC;
+ goto unlock;
+ }
+
+ snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID];
+ if (snapshot_id_attr) {
+ snapshot_id = nla_get_u32(snapshot_id_attr);
+
+ if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use");
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ err = __devlink_snapshot_id_insert(devlink, snapshot_id);
+ if (err)
+ goto unlock;
+ } else {
+ err = __devlink_region_snapshot_id_get(devlink, &snapshot_id);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Failed to allocate a new snapshot id");
+ goto unlock;
+ }
+ }
+
+ if (port)
+ err = region->port_ops->snapshot(port, region->port_ops,
+ info->extack, &data);
+ else
+ err = region->ops->snapshot(devlink, region->ops,
+ info->extack, &data);
+ if (err)
+ goto err_snapshot_capture;
+
+ err = __devlink_region_snapshot_create(region, data, snapshot_id);
+ if (err)
+ goto err_snapshot_create;
+
+ if (!snapshot_id_attr) {
+ struct sk_buff *msg;
+
+ snapshot = devlink_region_snapshot_get_by_id(region,
+ snapshot_id);
+ if (WARN_ON(!snapshot)) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ msg = devlink_nl_region_notify_build(region, snapshot,
+ DEVLINK_CMD_REGION_NEW,
+ info->snd_portid,
+ info->snd_seq);
+ err = PTR_ERR_OR_ZERO(msg);
+ if (err)
+ goto err_notify;
+
+ err = genlmsg_reply(msg, info);
+ if (err)
+ goto err_notify;
+ }
+
+ mutex_unlock(&region->snapshot_lock);
+ return 0;
+
+err_snapshot_create:
+ region->ops->destructor(data);
+err_snapshot_capture:
+ __devlink_snapshot_id_decrement(devlink, snapshot_id);
+ mutex_unlock(&region->snapshot_lock);
+ return err;
+
+err_notify:
+ devlink_region_snapshot_del(region, snapshot);
+unlock:
+ mutex_unlock(&region->snapshot_lock);
+ return err;
+}
+
static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
struct devlink *devlink,
u8 *chunk, u32 chunk_size,
@@ -3914,7 +6388,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb,
struct nlattr **attrs,
u64 start_offset,
u64 end_offset,
- bool dump,
u64 *new_offset)
{
struct devlink_snapshot *snapshot;
@@ -3929,9 +6402,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb,
if (!snapshot)
return -EINVAL;
- if (end_offset > region->size || dump)
- end_offset = region->size;
-
while (curr_offset < end_offset) {
u32 data_size;
u8 *data;
@@ -3959,26 +6429,24 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct genl_dumpit_info *info = genl_dumpit_info(cb);
- u64 ret_offset, start_offset, end_offset = 0;
+ u64 ret_offset, start_offset, end_offset = U64_MAX;
struct nlattr **attrs = info->attrs;
+ struct devlink_port *port = NULL;
struct devlink_region *region;
struct nlattr *chunks_attr;
const char *region_name;
struct devlink *devlink;
- bool dump = true;
+ unsigned int index;
void *hdr;
int err;
start_offset = *((u64 *)&cb->args[0]);
- mutex_lock(&devlink_mutex);
devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
- if (IS_ERR(devlink)) {
- err = PTR_ERR(devlink);
- goto out_dev;
- }
+ if (IS_ERR(devlink))
+ return PTR_ERR(devlink);
- mutex_lock(&devlink->lock);
+ devl_lock(devlink);
if (!attrs[DEVLINK_ATTR_REGION_NAME] ||
!attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) {
@@ -3986,15 +6454,43 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
goto out_unlock;
}
+ if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+ port = devlink_port_get_by_index(devlink, index);
+ if (!port) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]);
- region = devlink_region_get_by_name(devlink, region_name);
+
+ if (port)
+ region = devlink_port_region_get_by_name(port, region_name);
+ else
+ region = devlink_region_get_by_name(devlink, region_name);
+
if (!region) {
err = -EINVAL;
goto out_unlock;
}
+ if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&
+ attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) {
+ if (!start_offset)
+ start_offset =
+ nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+
+ end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+ end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
+ }
+
+ if (end_offset > region->size)
+ end_offset = region->size;
+
/* return 0 if there is no further data to read */
- if (start_offset >= region->size) {
+ if (start_offset == end_offset) {
err = 0;
goto out_unlock;
}
@@ -4011,6 +6507,13 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
if (err)
goto nla_put_failure;
+ if (region->port) {
+ err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX,
+ region->port->index);
+ if (err)
+ goto nla_put_failure;
+ }
+
err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name);
if (err)
goto nla_put_failure;
@@ -4021,22 +6524,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
goto nla_put_failure;
}
- if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&
- attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) {
- if (!start_offset)
- start_offset =
- nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
-
- end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
- end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
- dump = false;
- }
-
err = devlink_nl_region_read_snapshot_fill(skb, devlink,
region, attrs,
start_offset,
- end_offset, dump,
- &ret_offset);
+ end_offset, &ret_offset);
if (err && err != -EMSGSIZE)
goto nla_put_failure;
@@ -4051,43 +6542,59 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
nla_nest_end(skb, chunks_attr);
genlmsg_end(skb, hdr);
- mutex_unlock(&devlink->lock);
- mutex_unlock(&devlink_mutex);
-
+ devl_unlock(devlink);
+ devlink_put(devlink);
return skb->len;
nla_put_failure:
genlmsg_cancel(skb, hdr);
out_unlock:
- mutex_unlock(&devlink->lock);
-out_dev:
- mutex_unlock(&devlink_mutex);
+ devl_unlock(devlink);
+ devlink_put(devlink);
return err;
}
-struct devlink_info_req {
- struct sk_buff *msg;
-};
-
int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name)
{
+ if (!req->msg)
+ return 0;
return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name);
}
EXPORT_SYMBOL_GPL(devlink_info_driver_name_put);
int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn)
{
+ if (!req->msg)
+ return 0;
return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn);
}
EXPORT_SYMBOL_GPL(devlink_info_serial_number_put);
+int devlink_info_board_serial_number_put(struct devlink_info_req *req,
+ const char *bsn)
+{
+ if (!req->msg)
+ return 0;
+ return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER,
+ bsn);
+}
+EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put);
+
static int devlink_info_version_put(struct devlink_info_req *req, int attr,
const char *version_name,
- const char *version_value)
+ const char *version_value,
+ enum devlink_info_version_type version_type)
{
struct nlattr *nest;
int err;
+ if (req->version_cb)
+ req->version_cb(version_name, version_type,
+ req->version_cb_priv);
+
+ if (!req->msg)
+ return 0;
+
nest = nla_nest_start_noflag(req->msg, attr);
if (!nest)
return -EMSGSIZE;
@@ -4116,7 +6623,8 @@ int devlink_info_version_fixed_put(struct devlink_info_req *req,
const char *version_value)
{
return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED,
- version_name, version_value);
+ version_name, version_value,
+ DEVLINK_INFO_VERSION_TYPE_NONE);
}
EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put);
@@ -4125,25 +6633,49 @@ int devlink_info_version_stored_put(struct devlink_info_req *req,
const char *version_value)
{
return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED,
- version_name, version_value);
+ version_name, version_value,
+ DEVLINK_INFO_VERSION_TYPE_NONE);
}
EXPORT_SYMBOL_GPL(devlink_info_version_stored_put);
+int devlink_info_version_stored_put_ext(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value,
+ enum devlink_info_version_type version_type)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED,
+ version_name, version_value,
+ version_type);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext);
+
int devlink_info_version_running_put(struct devlink_info_req *req,
const char *version_name,
const char *version_value)
{
return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING,
- version_name, version_value);
+ version_name, version_value,
+ DEVLINK_INFO_VERSION_TYPE_NONE);
}
EXPORT_SYMBOL_GPL(devlink_info_version_running_put);
+int devlink_info_version_running_put_ext(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value,
+ enum devlink_info_version_type version_type)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING,
+ version_name, version_value,
+ version_type);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext);
+
static int
devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink,
enum devlink_command cmd, u32 portid,
u32 seq, int flags, struct netlink_ext_ack *extack)
{
- struct devlink_info_req req;
+ struct devlink_info_req req = {};
void *hdr;
int err;
@@ -4198,34 +6730,30 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg,
{
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err = 0;
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
- if (idx < start) {
- idx++;
- continue;
- }
-
- if (!devlink->ops->info_get) {
- idx++;
- continue;
- }
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ if (idx < start || !devlink->ops->info_get)
+ goto inc;
- mutex_lock(&devlink->lock);
+ devl_lock(devlink);
err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
cb->extack);
- mutex_unlock(&devlink->lock);
- if (err && err != -EOPNOTSUPP)
+ devl_unlock(devlink);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ else if (err) {
+ devlink_put(devlink);
break;
+ }
+inc:
idx++;
+ devlink_put(devlink);
}
- mutex_unlock(&devlink_mutex);
if (err != -EMSGSIZE)
return err;
@@ -4239,11 +6767,17 @@ struct devlink_fmsg_item {
int attrtype;
u8 nla_type;
u16 len;
- int value[0];
+ int value[];
};
struct devlink_fmsg {
struct list_head item_list;
+ bool putting_binary; /* This flag forces enclosing of binary data
+ * in an array brackets. It forces using
+ * of designated API:
+ * devlink_fmsg_binary_pair_nest_start()
+ * devlink_fmsg_binary_pair_nest_end()
+ */
};
static struct devlink_fmsg *devlink_fmsg_alloc(void)
@@ -4287,17 +6821,26 @@ static int devlink_fmsg_nest_common(struct devlink_fmsg *fmsg,
int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start);
static int devlink_fmsg_nest_end(struct devlink_fmsg *fmsg)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END);
}
int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_nest_end(fmsg);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end);
@@ -4308,6 +6851,9 @@ static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name)
{
struct devlink_fmsg_item *item;
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE)
return -EMSGSIZE;
@@ -4328,6 +6874,9 @@ int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name)
{
int err;
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START);
if (err)
return err;
@@ -4342,6 +6891,9 @@ EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start);
int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_nest_end(fmsg);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end);
@@ -4351,6 +6903,9 @@ int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
{
int err;
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
err = devlink_fmsg_pair_nest_start(fmsg, name);
if (err)
return err;
@@ -4367,6 +6922,9 @@ int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
{
int err;
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
err = devlink_fmsg_nest_end(fmsg);
if (err)
return err;
@@ -4379,6 +6937,30 @@ int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
}
EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end);
+int devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg,
+ const char *name)
+{
+ int err;
+
+ err = devlink_fmsg_arr_pair_nest_start(fmsg, name);
+ if (err)
+ return err;
+
+ fmsg->putting_binary = true;
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start);
+
+int devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg)
+{
+ if (!fmsg->putting_binary)
+ return -EINVAL;
+
+ fmsg->putting_binary = false;
+ return devlink_fmsg_arr_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end);
+
static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
const void *value, u16 value_len,
u8 value_nla_type)
@@ -4401,42 +6983,58 @@ static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
return 0;
}
-int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
+static int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG);
}
-EXPORT_SYMBOL_GPL(devlink_fmsg_bool_put);
-int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
+static int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8);
}
-EXPORT_SYMBOL_GPL(devlink_fmsg_u8_put);
int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put);
-int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
+static int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64);
}
-EXPORT_SYMBOL_GPL(devlink_fmsg_u64_put);
int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, value, strlen(value) + 1,
NLA_NUL_STRING);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_string_put);
-static int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
- u16 value_len)
+int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
+ u16 value_len)
{
+ if (!fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY);
}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put);
int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
bool value)
@@ -4547,10 +7145,11 @@ int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
const void *value, u32 value_len)
{
u32 data_size;
+ int end_err;
u32 offset;
int err;
- err = devlink_fmsg_arr_pair_nest_start(fmsg, name);
+ err = devlink_fmsg_binary_pair_nest_start(fmsg, name);
if (err)
return err;
@@ -4560,14 +7159,18 @@ int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
data_size = DEVLINK_FMSG_MAX_SIZE;
err = devlink_fmsg_binary_put(fmsg, value + offset, data_size);
if (err)
- return err;
+ break;
+ /* Exit from loop with a break (instead of
+ * return) to make sure putting_binary is turned off in
+ * devlink_fmsg_binary_pair_nest_end
+ */
}
- err = devlink_fmsg_arr_pair_nest_end(fmsg);
- if (err)
- return err;
+ end_err = devlink_fmsg_binary_pair_nest_end(fmsg);
+ if (end_err)
+ err = end_err;
- return 0;
+ return err;
}
EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put);
@@ -4754,10 +7357,12 @@ struct devlink_health_reporter {
void *priv;
const struct devlink_health_reporter_ops *ops;
struct devlink *devlink;
+ struct devlink_port *devlink_port;
struct devlink_fmsg *dump_fmsg;
struct mutex dump_lock; /* lock parallel read/write from dump buffers */
u64 graceful_period;
bool auto_recover;
+ bool auto_dump;
u8 health_state;
u64 dump_ts;
u64 dump_real_ts;
@@ -4775,32 +7380,110 @@ devlink_health_reporter_priv(struct devlink_health_reporter *reporter)
EXPORT_SYMBOL_GPL(devlink_health_reporter_priv);
static struct devlink_health_reporter *
-devlink_health_reporter_find_by_name(struct devlink *devlink,
- const char *reporter_name)
+__devlink_health_reporter_find_by_name(struct list_head *reporter_list,
+ struct mutex *list_lock,
+ const char *reporter_name)
{
struct devlink_health_reporter *reporter;
- lockdep_assert_held(&devlink->reporters_lock);
- list_for_each_entry(reporter, &devlink->reporter_list, list)
+ lockdep_assert_held(list_lock);
+ list_for_each_entry(reporter, reporter_list, list)
if (!strcmp(reporter->ops->name, reporter_name))
return reporter;
return NULL;
}
+static struct devlink_health_reporter *
+devlink_health_reporter_find_by_name(struct devlink *devlink,
+ const char *reporter_name)
+{
+ return __devlink_health_reporter_find_by_name(&devlink->reporter_list,
+ &devlink->reporters_lock,
+ reporter_name);
+}
+
+static struct devlink_health_reporter *
+devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port,
+ const char *reporter_name)
+{
+ return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list,
+ &devlink_port->reporters_lock,
+ reporter_name);
+}
+
+static struct devlink_health_reporter *
+__devlink_health_reporter_create(struct devlink *devlink,
+ const struct devlink_health_reporter_ops *ops,
+ u64 graceful_period, void *priv)
+{
+ struct devlink_health_reporter *reporter;
+
+ if (WARN_ON(graceful_period && !ops->recover))
+ return ERR_PTR(-EINVAL);
+
+ reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
+ if (!reporter)
+ return ERR_PTR(-ENOMEM);
+
+ reporter->priv = priv;
+ reporter->ops = ops;
+ reporter->devlink = devlink;
+ reporter->graceful_period = graceful_period;
+ reporter->auto_recover = !!ops->recover;
+ reporter->auto_dump = !!ops->dump;
+ mutex_init(&reporter->dump_lock);
+ refcount_set(&reporter->refcount, 1);
+ return reporter;
+}
+
+/**
+ * devlink_port_health_reporter_create - create devlink health reporter for
+ * specified port instance
+ *
+ * @port: devlink_port which should contain the new reporter
+ * @ops: ops
+ * @graceful_period: to avoid recovery loops, in msecs
+ * @priv: priv
+ */
+struct devlink_health_reporter *
+devlink_port_health_reporter_create(struct devlink_port *port,
+ const struct devlink_health_reporter_ops *ops,
+ u64 graceful_period, void *priv)
+{
+ struct devlink_health_reporter *reporter;
+
+ mutex_lock(&port->reporters_lock);
+ if (__devlink_health_reporter_find_by_name(&port->reporter_list,
+ &port->reporters_lock, ops->name)) {
+ reporter = ERR_PTR(-EEXIST);
+ goto unlock;
+ }
+
+ reporter = __devlink_health_reporter_create(port->devlink, ops,
+ graceful_period, priv);
+ if (IS_ERR(reporter))
+ goto unlock;
+
+ reporter->devlink_port = port;
+ list_add_tail(&reporter->list, &port->reporter_list);
+unlock:
+ mutex_unlock(&port->reporters_lock);
+ return reporter;
+}
+EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create);
+
/**
* devlink_health_reporter_create - create devlink health reporter
*
* @devlink: devlink
* @ops: ops
* @graceful_period: to avoid recovery loops, in msecs
- * @auto_recover: auto recover when error occurs
* @priv: priv
*/
struct devlink_health_reporter *
devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, bool auto_recover,
- void *priv)
+ u64 graceful_period, void *priv)
{
struct devlink_health_reporter *reporter;
@@ -4810,25 +7493,11 @@ devlink_health_reporter_create(struct devlink *devlink,
goto unlock;
}
- if (WARN_ON(auto_recover && !ops->recover) ||
- WARN_ON(graceful_period && !ops->recover)) {
- reporter = ERR_PTR(-EINVAL);
- goto unlock;
- }
-
- reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
- if (!reporter) {
- reporter = ERR_PTR(-ENOMEM);
+ reporter = __devlink_health_reporter_create(devlink, ops,
+ graceful_period, priv);
+ if (IS_ERR(reporter))
goto unlock;
- }
- reporter->priv = priv;
- reporter->ops = ops;
- reporter->devlink = devlink;
- reporter->graceful_period = graceful_period;
- reporter->auto_recover = auto_recover;
- mutex_init(&reporter->dump_lock);
- refcount_set(&reporter->refcount, 1);
list_add_tail(&reporter->list, &devlink->reporter_list);
unlock:
mutex_unlock(&devlink->reporters_lock);
@@ -4836,6 +7505,29 @@ unlock:
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_create);
+static void
+devlink_health_reporter_free(struct devlink_health_reporter *reporter)
+{
+ mutex_destroy(&reporter->dump_lock);
+ if (reporter->dump_fmsg)
+ devlink_fmsg_free(reporter->dump_fmsg);
+ kfree(reporter);
+}
+
+static void
+devlink_health_reporter_put(struct devlink_health_reporter *reporter)
+{
+ if (refcount_dec_and_test(&reporter->refcount))
+ devlink_health_reporter_free(reporter);
+}
+
+static void
+__devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
+{
+ list_del(&reporter->list);
+ devlink_health_reporter_put(reporter);
+}
+
/**
* devlink_health_reporter_destroy - destroy devlink health reporter
*
@@ -4844,25 +7536,37 @@ EXPORT_SYMBOL_GPL(devlink_health_reporter_create);
void
devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
{
- mutex_lock(&reporter->devlink->reporters_lock);
- list_del(&reporter->list);
- mutex_unlock(&reporter->devlink->reporters_lock);
- while (refcount_read(&reporter->refcount) > 1)
- msleep(100);
- mutex_destroy(&reporter->dump_lock);
- if (reporter->dump_fmsg)
- devlink_fmsg_free(reporter->dump_fmsg);
- kfree(reporter);
+ struct mutex *lock = &reporter->devlink->reporters_lock;
+
+ mutex_lock(lock);
+ __devlink_health_reporter_destroy(reporter);
+ mutex_unlock(lock);
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy);
+/**
+ * devlink_port_health_reporter_destroy - destroy devlink port health reporter
+ *
+ * @reporter: devlink health reporter to destroy
+ */
+void
+devlink_port_health_reporter_destroy(struct devlink_health_reporter *reporter)
+{
+ struct mutex *lock = &reporter->devlink_port->reporters_lock;
+
+ mutex_lock(lock);
+ __devlink_health_reporter_destroy(reporter);
+ mutex_unlock(lock);
+}
+EXPORT_SYMBOL_GPL(devlink_port_health_reporter_destroy);
+
static int
devlink_nl_health_reporter_fill(struct sk_buff *msg,
- struct devlink *devlink,
struct devlink_health_reporter *reporter,
enum devlink_command cmd, u32 portid,
u32 seq, int flags)
{
+ struct devlink *devlink = reporter->devlink;
struct nlattr *reporter_attr;
void *hdr;
@@ -4873,6 +7577,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
if (devlink_nl_put_handle(msg, devlink))
goto genlmsg_cancel;
+ if (reporter->devlink_port) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, reporter->devlink_port->index))
+ goto genlmsg_cancel;
+ }
reporter_attr = nla_nest_start_noflag(msg,
DEVLINK_ATTR_HEALTH_REPORTER);
if (!reporter_attr)
@@ -4907,6 +7615,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS,
reporter->dump_real_ts, DEVLINK_ATTR_PAD))
goto reporter_nest_cancel;
+ if (reporter->ops->dump &&
+ nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP,
+ reporter->auto_dump))
+ goto reporter_nest_cancel;
nla_nest_end(msg, reporter_attr);
genlmsg_end(msg, hdr);
@@ -4922,25 +7634,25 @@ genlmsg_cancel:
static void devlink_recover_notify(struct devlink_health_reporter *reporter,
enum devlink_command cmd)
{
+ struct devlink *devlink = reporter->devlink;
struct sk_buff *msg;
int err;
WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
+ WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED));
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
- err = devlink_nl_health_reporter_fill(msg, reporter->devlink,
- reporter, cmd, 0, 0, 0);
+ err = devlink_nl_health_reporter_fill(msg, reporter, cmd, 0, 0, 0);
if (err) {
nlmsg_free(msg);
return;
}
- genlmsg_multicast_netns(&devlink_nl_family,
- devlink_net(reporter->devlink),
- msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg,
+ 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
void
@@ -5029,6 +7741,8 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
{
enum devlink_health_reporter_state prev_health_state;
struct devlink *devlink = reporter->devlink;
+ unsigned long recover_ts_threshold;
+ int ret;
/* write a log message of the current error */
WARN_ON(!msg);
@@ -5039,10 +7753,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
/* abort if the previous error wasn't recovered */
+ recover_ts_threshold = reporter->last_recovery_ts +
+ msecs_to_jiffies(reporter->graceful_period);
if (reporter->auto_recover &&
(prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY ||
- jiffies - reporter->last_recovery_ts <
- msecs_to_jiffies(reporter->graceful_period))) {
+ (reporter->last_recovery_ts && reporter->recovery_count &&
+ time_is_after_jiffies(recover_ts_threshold)))) {
trace_devlink_health_recover_aborted(devlink,
reporter->ops->name,
reporter->health_state,
@@ -5053,16 +7769,21 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
- mutex_lock(&reporter->dump_lock);
- /* store current dump of current error, for later analysis */
- devlink_health_do_dump(reporter, priv_ctx, NULL);
- mutex_unlock(&reporter->dump_lock);
+ if (reporter->auto_dump) {
+ mutex_lock(&reporter->dump_lock);
+ /* store current dump of current error, for later analysis */
+ devlink_health_do_dump(reporter, priv_ctx, NULL);
+ mutex_unlock(&reporter->dump_lock);
+ }
+
+ if (!reporter->auto_recover)
+ return 0;
- if (reporter->auto_recover)
- return devlink_health_reporter_recover(reporter,
- priv_ctx, NULL);
+ devl_lock(devlink);
+ ret = devlink_health_reporter_recover(reporter, priv_ctx, NULL);
+ devl_unlock(devlink);
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(devlink_health_report);
@@ -5071,17 +7792,28 @@ devlink_health_reporter_get_from_attrs(struct devlink *devlink,
struct nlattr **attrs)
{
struct devlink_health_reporter *reporter;
+ struct devlink_port *devlink_port;
char *reporter_name;
if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
return NULL;
reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
- mutex_lock(&devlink->reporters_lock);
- reporter = devlink_health_reporter_find_by_name(devlink, reporter_name);
- if (reporter)
- refcount_inc(&reporter->refcount);
- mutex_unlock(&devlink->reporters_lock);
+ devlink_port = devlink_port_get_from_attrs(devlink, attrs);
+ if (IS_ERR(devlink_port)) {
+ mutex_lock(&devlink->reporters_lock);
+ reporter = devlink_health_reporter_find_by_name(devlink, reporter_name);
+ if (reporter)
+ refcount_inc(&reporter->refcount);
+ mutex_unlock(&devlink->reporters_lock);
+ } else {
+ mutex_lock(&devlink_port->reporters_lock);
+ reporter = devlink_port_health_reporter_find_by_name(devlink_port, reporter_name);
+ if (reporter)
+ refcount_inc(&reporter->refcount);
+ mutex_unlock(&devlink_port->reporters_lock);
+ }
+
return reporter;
}
@@ -5100,23 +7832,13 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb)
struct nlattr **attrs = info->attrs;
struct devlink *devlink;
- mutex_lock(&devlink_mutex);
devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
if (IS_ERR(devlink))
- goto unlock;
+ return NULL;
reporter = devlink_health_reporter_get_from_attrs(devlink, attrs);
- mutex_unlock(&devlink_mutex);
+ devlink_put(devlink);
return reporter;
-unlock:
- mutex_unlock(&devlink_mutex);
- return NULL;
-}
-
-static void
-devlink_health_reporter_put(struct devlink_health_reporter *reporter)
-{
- refcount_dec(&reporter->refcount);
}
void
@@ -5155,7 +7877,7 @@ static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb,
goto out;
}
- err = devlink_nl_health_reporter_fill(msg, devlink, reporter,
+ err = devlink_nl_health_reporter_fill(msg, reporter,
DEVLINK_CMD_HEALTH_REPORTER_GET,
info->snd_portid, info->snd_seq,
0);
@@ -5175,15 +7897,14 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
struct netlink_callback *cb)
{
struct devlink_health_reporter *reporter;
+ struct devlink_port *port;
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err;
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
mutex_lock(&devlink->reporters_lock);
list_for_each_entry(reporter, &devlink->reporter_list,
list) {
@@ -5191,23 +7912,49 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
idx++;
continue;
}
- err = devlink_nl_health_reporter_fill(msg, devlink,
- reporter,
- DEVLINK_CMD_HEALTH_REPORTER_GET,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI);
+ err = devlink_nl_health_reporter_fill(
+ msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET,
+ NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
if (err) {
mutex_unlock(&devlink->reporters_lock);
+ devlink_put(devlink);
goto out;
}
idx++;
}
mutex_unlock(&devlink->reporters_lock);
+ devlink_put(devlink);
}
-out:
- mutex_unlock(&devlink_mutex);
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ devl_lock(devlink);
+ list_for_each_entry(port, &devlink->port_list, list) {
+ mutex_lock(&port->reporters_lock);
+ list_for_each_entry(reporter, &port->reporter_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_health_reporter_fill(
+ msg, reporter,
+ DEVLINK_CMD_HEALTH_REPORTER_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ if (err) {
+ mutex_unlock(&port->reporters_lock);
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&port->reporters_lock);
+ }
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ }
+out:
cb->args[0] = idx;
return msg->len;
}
@@ -5230,6 +7977,11 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
err = -EOPNOTSUPP;
goto out;
}
+ if (!reporter->ops->dump &&
+ info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
reporter->graceful_period =
@@ -5239,6 +7991,10 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
reporter->auto_recover =
nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]);
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
+ reporter->auto_dump =
+ nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]);
+
devlink_health_reporter_put(reporter);
return 0;
out:
@@ -5368,25 +8124,64 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb,
return 0;
}
+static int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_health_reporter *reporter;
+ int err;
+
+ reporter = devlink_health_reporter_get_from_info(devlink, info);
+ if (!reporter)
+ return -EINVAL;
+
+ if (!reporter->ops->test) {
+ devlink_health_reporter_put(reporter);
+ return -EOPNOTSUPP;
+ }
+
+ err = reporter->ops->test(reporter, info->extack);
+
+ devlink_health_reporter_put(reporter);
+ return err;
+}
+
struct devlink_stats {
- u64 rx_bytes;
- u64 rx_packets;
+ u64_stats_t rx_bytes;
+ u64_stats_t rx_packets;
struct u64_stats_sync syncp;
};
/**
+ * struct devlink_trap_policer_item - Packet trap policer attributes.
+ * @policer: Immutable packet trap policer attributes.
+ * @rate: Rate in packets / sec.
+ * @burst: Burst size in packets.
+ * @list: trap_policer_list member.
+ *
+ * Describes packet trap policer attributes. Created by devlink during trap
+ * policer registration.
+ */
+struct devlink_trap_policer_item {
+ const struct devlink_trap_policer *policer;
+ u64 rate;
+ u64 burst;
+ struct list_head list;
+};
+
+/**
* struct devlink_trap_group_item - Packet trap group attributes.
* @group: Immutable packet trap group attributes.
- * @refcount: Number of trap items using the group.
+ * @policer_item: Associated policer item. Can be NULL.
* @list: trap_group_list member.
* @stats: Trap group statistics.
*
* Describes packet trap group attributes. Created by devlink during trap
- * registration.
+ * group registration.
*/
struct devlink_trap_group_item {
const struct devlink_trap_group *group;
- refcount_t refcount;
+ struct devlink_trap_policer_item *policer_item;
struct list_head list;
struct devlink_stats __percpu *stats;
};
@@ -5412,6 +8207,19 @@ struct devlink_trap_item {
void *priv;
};
+static struct devlink_trap_policer_item *
+devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id)
+{
+ struct devlink_trap_policer_item *policer_item;
+
+ list_for_each_entry(policer_item, &devlink->trap_policer_list, list) {
+ if (policer_item->policer->id == id)
+ return policer_item;
+ }
+
+ return NULL;
+}
+
static struct devlink_trap_item *
devlink_trap_item_lookup(struct devlink *devlink, const char *name)
{
@@ -5446,8 +8254,9 @@ devlink_trap_action_get_from_info(struct genl_info *info,
val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]);
switch (val) {
- case DEVLINK_TRAP_ACTION_DROP: /* fall-through */
+ case DEVLINK_TRAP_ACTION_DROP:
case DEVLINK_TRAP_ACTION_TRAP:
+ case DEVLINK_TRAP_ACTION_MIRROR:
*p_trap_action = val;
break;
default:
@@ -5469,6 +8278,9 @@ static int devlink_trap_metadata_put(struct sk_buff *msg,
if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) &&
nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT))
goto nla_put_failure;
+ if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) &&
+ nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE))
+ goto nla_put_failure;
nla_nest_end(msg, attr);
@@ -5493,17 +8305,18 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats,
cpu_stats = per_cpu_ptr(trap_stats, i);
do {
start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
- rx_packets = cpu_stats->rx_packets;
- rx_bytes = cpu_stats->rx_bytes;
+ rx_packets = u64_stats_read(&cpu_stats->rx_packets);
+ rx_bytes = u64_stats_read(&cpu_stats->rx_bytes);
} while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
- stats->rx_packets += rx_packets;
- stats->rx_bytes += rx_bytes;
+ u64_stats_add(&stats->rx_packets, rx_packets);
+ u64_stats_add(&stats->rx_bytes, rx_bytes);
}
}
-static int devlink_trap_stats_put(struct sk_buff *msg,
- struct devlink_stats __percpu *trap_stats)
+static int
+devlink_trap_group_stats_put(struct sk_buff *msg,
+ struct devlink_stats __percpu *trap_stats)
{
struct devlink_stats stats;
struct nlattr *attr;
@@ -5515,11 +8328,59 @@ static int devlink_trap_stats_put(struct sk_buff *msg,
return -EMSGSIZE;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS,
- stats.rx_packets, DEVLINK_ATTR_PAD))
+ u64_stats_read(&stats.rx_packets),
+ DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES,
+ u64_stats_read(&stats.rx_bytes),
+ DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_item *trap_item)
+{
+ struct devlink_stats stats;
+ struct nlattr *attr;
+ u64 drops = 0;
+ int err;
+
+ if (devlink->ops->trap_drop_counter_get) {
+ err = devlink->ops->trap_drop_counter_get(devlink,
+ trap_item->trap,
+ &drops);
+ if (err)
+ return err;
+ }
+
+ devlink_trap_stats_read(trap_item->stats, &stats);
+
+ attr = nla_nest_start(msg, DEVLINK_ATTR_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (devlink->ops->trap_drop_counter_get &&
+ nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops,
+ DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS,
+ u64_stats_read(&stats.rx_packets),
+ DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES,
- stats.rx_bytes, DEVLINK_ATTR_PAD))
+ u64_stats_read(&stats.rx_bytes),
+ DEVLINK_ATTR_PAD))
goto nla_put_failure;
nla_nest_end(msg, attr);
@@ -5568,7 +8429,7 @@ static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink,
if (err)
goto nla_put_failure;
- err = devlink_trap_stats_put(msg, trap_item->stats);
+ err = devlink_trap_stats_put(msg, devlink, trap_item);
if (err)
goto nla_put_failure;
@@ -5622,14 +8483,12 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg,
struct devlink_trap_item *trap_item;
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err;
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
- mutex_lock(&devlink->lock);
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ devl_lock(devlink);
list_for_each_entry(trap_item, &devlink->trap_list, list) {
if (idx < start) {
idx++;
@@ -5641,16 +8500,16 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg,
cb->nlh->nlmsg_seq,
NLM_F_MULTI);
if (err) {
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
+ devlink_put(devlink);
goto out;
}
idx++;
}
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
+ devlink_put(devlink);
}
out:
- mutex_unlock(&devlink_mutex);
-
cb->args[0] = idx;
return msg->len;
}
@@ -5669,7 +8528,7 @@ static int __devlink_trap_action_set(struct devlink *devlink,
}
err = devlink->ops->trap_action_set(devlink, trap_item->trap,
- trap_action);
+ trap_action, extack);
if (err)
return err;
@@ -5704,7 +8563,6 @@ static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb,
struct netlink_ext_ack *extack = info->extack;
struct devlink *devlink = info->user_ptr[0];
struct devlink_trap_item *trap_item;
- int err;
if (list_empty(&devlink->trap_list))
return -EOPNOTSUPP;
@@ -5715,11 +8573,7 @@ static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb,
return -ENOENT;
}
- err = devlink_trap_action_set(devlink, trap_item, info);
- if (err)
- return err;
-
- return 0;
+ return devlink_trap_action_set(devlink, trap_item, info);
}
static struct devlink_trap_group_item *
@@ -5736,6 +8590,19 @@ devlink_trap_group_item_lookup(struct devlink *devlink, const char *name)
}
static struct devlink_trap_group_item *
+devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id)
+{
+ struct devlink_trap_group_item *group_item;
+
+ list_for_each_entry(group_item, &devlink->trap_group_list, list) {
+ if (group_item->group->id == id)
+ return group_item;
+ }
+
+ return NULL;
+}
+
+static struct devlink_trap_group_item *
devlink_trap_group_item_get_from_info(struct devlink *devlink,
struct genl_info *info)
{
@@ -5772,7 +8639,12 @@ devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink,
nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC))
goto nla_put_failure;
- err = devlink_trap_stats_put(msg, group_item->stats);
+ if (group_item->policer_item &&
+ nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
+ group_item->policer_item->policer->id))
+ goto nla_put_failure;
+
+ err = devlink_trap_group_stats_put(msg, group_item->stats);
if (err)
goto nla_put_failure;
@@ -5828,14 +8700,12 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg,
u32 portid = NETLINK_CB(cb->skb).portid;
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err;
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
- mutex_lock(&devlink->lock);
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ devl_lock(devlink);
list_for_each_entry(group_item, &devlink->trap_group_list,
list) {
if (idx < start) {
@@ -5848,16 +8718,16 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg,
cb->nlh->nlmsg_seq,
NLM_F_MULTI);
if (err) {
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
+ devlink_put(devlink);
goto out;
}
idx++;
}
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
+ devlink_put(devlink);
}
out:
- mutex_unlock(&devlink_mutex);
-
cb->args[0] = idx;
return msg->len;
}
@@ -5872,8 +8742,26 @@ __devlink_trap_group_action_set(struct devlink *devlink,
struct devlink_trap_item *trap_item;
int err;
+ if (devlink->ops->trap_group_action_set) {
+ err = devlink->ops->trap_group_action_set(devlink, group_item->group,
+ trap_action, extack);
+ if (err)
+ return err;
+
+ list_for_each_entry(trap_item, &devlink->trap_list, list) {
+ if (strcmp(trap_item->group_item->group->name, group_name))
+ continue;
+ if (trap_item->action != trap_action &&
+ trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP)
+ continue;
+ trap_item->action = trap_action;
+ }
+
+ return 0;
+ }
+
list_for_each_entry(trap_item, &devlink->trap_list, list) {
- if (strcmp(trap_item->trap->group.name, group_name))
+ if (strcmp(trap_item->group_item->group->name, group_name))
continue;
err = __devlink_trap_action_set(devlink, trap_item,
trap_action, extack);
@@ -5887,7 +8775,7 @@ __devlink_trap_group_action_set(struct devlink *devlink,
static int
devlink_trap_group_action_set(struct devlink *devlink,
struct devlink_trap_group_item *group_item,
- struct genl_info *info)
+ struct genl_info *info, bool *p_modified)
{
enum devlink_trap_action trap_action;
int err;
@@ -5906,6 +8794,48 @@ devlink_trap_group_action_set(struct devlink *devlink,
if (err)
return err;
+ *p_modified = true;
+
+ return 0;
+}
+
+static int devlink_trap_group_set(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item,
+ struct genl_info *info)
+{
+ struct devlink_trap_policer_item *policer_item;
+ struct netlink_ext_ack *extack = info->extack;
+ const struct devlink_trap_policer *policer;
+ struct nlattr **attrs = info->attrs;
+ int err;
+
+ if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID])
+ return 0;
+
+ if (!devlink->ops->trap_group_set)
+ return -EOPNOTSUPP;
+
+ policer_item = group_item->policer_item;
+ if (attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) {
+ u32 policer_id;
+
+ policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]);
+ policer_item = devlink_trap_policer_item_lookup(devlink,
+ policer_id);
+ if (policer_id && !policer_item) {
+ NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
+ return -ENOENT;
+ }
+ }
+ policer = policer_item ? policer_item->policer : NULL;
+
+ err = devlink->ops->trap_group_set(devlink, group_item->group, policer,
+ extack);
+ if (err)
+ return err;
+
+ group_item->policer_item = policer_item;
+
return 0;
}
@@ -5915,6 +8845,7 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb,
struct netlink_ext_ack *extack = info->extack;
struct devlink *devlink = info->user_ptr[0];
struct devlink_trap_group_item *group_item;
+ bool modified = false;
int err;
if (list_empty(&devlink->trap_group_list))
@@ -5926,18 +8857,265 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb,
return -ENOENT;
}
- err = devlink_trap_group_action_set(devlink, group_item, info);
+ err = devlink_trap_group_action_set(devlink, group_item, info,
+ &modified);
+ if (err)
+ return err;
+
+ err = devlink_trap_group_set(devlink, group_item, info);
+ if (err)
+ goto err_trap_group_set;
+
+ return 0;
+
+err_trap_group_set:
+ if (modified)
+ NL_SET_ERR_MSG_MOD(extack, "Trap group set failed, but some changes were committed already");
+ return err;
+}
+
+static struct devlink_trap_policer_item *
+devlink_trap_policer_item_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ u32 id;
+
+ if (!info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID])
+ return NULL;
+ id = nla_get_u32(info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]);
+
+ return devlink_trap_policer_item_lookup(devlink, id);
+}
+
+static int
+devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_policer *policer)
+{
+ struct nlattr *attr;
+ u64 drops;
+ int err;
+
+ if (!devlink->ops->trap_policer_counter_get)
+ return 0;
+
+ err = devlink->ops->trap_policer_counter_get(devlink, policer, &drops);
+ if (err)
+ return err;
+
+ attr = nla_nest_start(msg, DEVLINK_ATTR_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops,
+ DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int
+devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_policer_item *policer_item,
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags)
+{
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
+ policer_item->policer->id))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_RATE,
+ policer_item->rate, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_BURST,
+ policer_item->burst, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ err = devlink_trap_policer_stats_put(msg, devlink,
+ policer_item->policer);
+ if (err)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_trap_policer_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_trap_policer_item *policer_item;
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ if (list_empty(&devlink->trap_policer_list))
+ return -EOPNOTSUPP;
+
+ policer_item = devlink_trap_policer_item_get_from_info(devlink, info);
+ if (!policer_item) {
+ NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
+ return -ENOENT;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_trap_policer_fill(msg, devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err)
+ goto err_trap_policer_fill;
+
+ return genlmsg_reply(msg, info);
+
+err_trap_policer_fill:
+ nlmsg_free(msg);
+ return err;
+}
+
+static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ enum devlink_command cmd = DEVLINK_CMD_TRAP_POLICER_NEW;
+ struct devlink_trap_policer_item *policer_item;
+ u32 portid = NETLINK_CB(cb->skb).portid;
+ struct devlink *devlink;
+ int start = cb->args[0];
+ unsigned long index;
+ int idx = 0;
+ int err;
+
+ devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
+ devl_lock(devlink);
+ list_for_each_entry(policer_item, &devlink->trap_policer_list,
+ list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_trap_policer_fill(msg, devlink,
+ policer_item, cmd,
+ portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err) {
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ goto out;
+ }
+ idx++;
+ }
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ }
+out:
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int
+devlink_trap_policer_set(struct devlink *devlink,
+ struct devlink_trap_policer_item *policer_item,
+ struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct nlattr **attrs = info->attrs;
+ u64 rate, burst;
+ int err;
+
+ rate = policer_item->rate;
+ burst = policer_item->burst;
+
+ if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE])
+ rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]);
+
+ if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST])
+ burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]);
+
+ if (rate < policer_item->policer->min_rate) {
+ NL_SET_ERR_MSG_MOD(extack, "Policer rate lower than limit");
+ return -EINVAL;
+ }
+
+ if (rate > policer_item->policer->max_rate) {
+ NL_SET_ERR_MSG_MOD(extack, "Policer rate higher than limit");
+ return -EINVAL;
+ }
+
+ if (burst < policer_item->policer->min_burst) {
+ NL_SET_ERR_MSG_MOD(extack, "Policer burst size lower than limit");
+ return -EINVAL;
+ }
+
+ if (burst > policer_item->policer->max_burst) {
+ NL_SET_ERR_MSG_MOD(extack, "Policer burst size higher than limit");
+ return -EINVAL;
+ }
+
+ err = devlink->ops->trap_policer_set(devlink, policer_item->policer,
+ rate, burst, info->extack);
if (err)
return err;
+ policer_item->rate = rate;
+ policer_item->burst = burst;
+
return 0;
}
+static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_trap_policer_item *policer_item;
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+
+ if (list_empty(&devlink->trap_policer_list))
+ return -EOPNOTSUPP;
+
+ if (!devlink->ops->trap_policer_set)
+ return -EOPNOTSUPP;
+
+ policer_item = devlink_trap_policer_item_get_from_info(devlink, info);
+ if (!policer_item) {
+ NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
+ return -ENOENT;
+ }
+
+ return devlink_trap_policer_set(devlink, policer_item, info);
+}
+
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
+ [DEVLINK_ATTR_UNSPEC] = { .strict_start_type =
+ DEVLINK_ATTR_TRAP_POLICER_ID },
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
- [DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_PORT_TYPE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_PORT_TYPE_AUTO,
+ DEVLINK_PORT_TYPE_IB),
[DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 },
@@ -5946,7 +9124,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 },
[DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
- [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_RANGE(NLA_U16, DEVLINK_ESWITCH_MODE_LEGACY,
+ DEVLINK_ESWITCH_MODE_SWITCHDEV),
[DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 },
[DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 },
[DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING },
@@ -5965,21 +9144,42 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 },
[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] =
+ NLA_POLICY_BITFIELD32(DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS),
[DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 },
[DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 },
[DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 },
[DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 },
+ [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 },
+ [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED },
+ [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
+ DEVLINK_RELOAD_ACTION_MAX),
+ [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK),
+ [DEVLINK_ATTR_PORT_FLAVOUR] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_RATE_TYPE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64 },
+ [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64 },
+ [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED },
};
-static const struct genl_ops devlink_nl_ops[] = {
+static const struct genl_small_ops devlink_nl_ops[] = {
{
.cmd = DEVLINK_CMD_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_get_doit,
.dumpit = devlink_nl_cmd_get_dumpit,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
/* can be retrieved by unprivileged users */
},
{
@@ -5998,28 +9198,71 @@ static const struct genl_ops devlink_nl_ops[] = {
.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
},
{
+ .cmd = DEVLINK_CMD_RATE_GET,
+ .doit = devlink_nl_cmd_rate_get_doit,
+ .dumpit = devlink_nl_cmd_rate_get_dumpit,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_RATE,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_SET,
+ .doit = devlink_nl_cmd_rate_set_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_RATE,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_NEW,
+ .doit = devlink_nl_cmd_rate_new_doit,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_DEL,
+ .doit = devlink_nl_cmd_rate_del_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_RATE_NODE,
+ },
+ {
.cmd = DEVLINK_CMD_PORT_SPLIT,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_port_split_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
},
{
.cmd = DEVLINK_CMD_PORT_UNSPLIT,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_port_unsplit_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_NEW,
+ .doit = devlink_nl_cmd_port_new_doit,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_DEL,
+ .doit = devlink_nl_cmd_port_del_doit,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = DEVLINK_CMD_LINECARD_GET,
+ .doit = devlink_nl_cmd_linecard_get_doit,
+ .dumpit = devlink_nl_cmd_linecard_get_dumpit,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_LINECARD_SET,
+ .doit = devlink_nl_cmd_linecard_set_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD,
},
{
.cmd = DEVLINK_CMD_SB_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_get_doit,
.dumpit = devlink_nl_cmd_sb_get_dumpit,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NEED_SB,
/* can be retrieved by unprivileged users */
},
{
@@ -6027,8 +9270,6 @@ static const struct genl_ops devlink_nl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_pool_get_doit,
.dumpit = devlink_nl_cmd_sb_pool_get_dumpit,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NEED_SB,
/* can be retrieved by unprivileged users */
},
{
@@ -6036,16 +9277,13 @@ static const struct genl_ops devlink_nl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_pool_set_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NEED_SB,
},
{
.cmd = DEVLINK_CMD_SB_PORT_POOL_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_port_pool_get_doit,
.dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit,
- .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
- DEVLINK_NL_FLAG_NEED_SB,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
/* can be retrieved by unprivileged users */
},
{
@@ -6053,16 +9291,14 @@ static const struct genl_ops devlink_nl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_port_pool_set_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
- DEVLINK_NL_FLAG_NEED_SB,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
},
{
.cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit,
.dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit,
- .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
- DEVLINK_NL_FLAG_NEED_SB,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
/* can be retrieved by unprivileged users */
},
{
@@ -6070,59 +9306,48 @@ static const struct genl_ops devlink_nl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
- DEVLINK_NL_FLAG_NEED_SB,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
},
{
.cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_occ_snapshot_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NEED_SB,
},
{
.cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_occ_max_clear_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NEED_SB,
},
{
.cmd = DEVLINK_CMD_ESWITCH_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_eswitch_get_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
.cmd = DEVLINK_CMD_ESWITCH_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_eswitch_set_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
},
{
.cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_dpipe_table_get,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_dpipe_entries_get,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_DPIPE_HEADERS_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_dpipe_headers_get,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
/* can be retrieved by unprivileged users */
},
{
@@ -6130,20 +9355,17 @@ static const struct genl_ops devlink_nl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_dpipe_table_counters_set,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
.cmd = DEVLINK_CMD_RESOURCE_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_resource_set,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
.cmd = DEVLINK_CMD_RESOURCE_DUMP,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_resource_dump,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
/* can be retrieved by unprivileged users */
},
{
@@ -6151,15 +9373,12 @@ static const struct genl_ops devlink_nl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_reload,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
},
{
.cmd = DEVLINK_CMD_PARAM_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_param_get_doit,
.dumpit = devlink_nl_cmd_param_get_dumpit,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
/* can be retrieved by unprivileged users */
},
{
@@ -6167,7 +9386,6 @@ static const struct genl_ops devlink_nl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_param_set_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
.cmd = DEVLINK_CMD_PORT_PARAM_GET,
@@ -6190,14 +9408,18 @@ static const struct genl_ops devlink_nl_ops[] = {
.doit = devlink_nl_cmd_region_get_doit,
.dumpit = devlink_nl_cmd_region_get_dumpit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_NEW,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = devlink_nl_cmd_region_new,
+ .flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_REGION_DEL,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_region_del,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
.cmd = DEVLINK_CMD_REGION_READ,
@@ -6205,14 +9427,12 @@ static const struct genl_ops devlink_nl_ops[] = {
GENL_DONT_VALIDATE_DUMP_STRICT,
.dumpit = devlink_nl_cmd_region_read_dumpit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
.cmd = DEVLINK_CMD_INFO_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_info_get_doit,
.dumpit = devlink_nl_cmd_info_get_dumpit,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
/* can be retrieved by unprivileged users */
},
{
@@ -6220,8 +9440,7 @@ static const struct genl_ops devlink_nl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_health_reporter_get_doit,
.dumpit = devlink_nl_cmd_health_reporter_get_dumpit,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
/* can be retrieved by unprivileged users */
},
{
@@ -6229,24 +9448,21 @@ static const struct genl_ops devlink_nl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_health_reporter_set_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
},
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_health_reporter_recover_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
},
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_health_reporter_diagnose_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
},
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
@@ -6254,49 +9470,70 @@ static const struct genl_ops devlink_nl_ops[] = {
GENL_DONT_VALIDATE_DUMP_STRICT,
.dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
},
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_health_reporter_dump_clear_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = devlink_nl_cmd_health_reporter_test_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
},
{
.cmd = DEVLINK_CMD_FLASH_UPDATE,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_flash_update,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
.cmd = DEVLINK_CMD_TRAP_GET,
.doit = devlink_nl_cmd_trap_get_doit,
.dumpit = devlink_nl_cmd_trap_get_dumpit,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_TRAP_SET,
.doit = devlink_nl_cmd_trap_set_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
.cmd = DEVLINK_CMD_TRAP_GROUP_GET,
.doit = devlink_nl_cmd_trap_group_get_doit,
.dumpit = devlink_nl_cmd_trap_group_get_dumpit,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_TRAP_GROUP_SET,
.doit = devlink_nl_cmd_trap_group_set_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_GET,
+ .doit = devlink_nl_cmd_trap_policer_get_doit,
+ .dumpit = devlink_nl_cmd_trap_policer_get_dumpit,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_SET,
+ .doit = devlink_nl_cmd_trap_policer_set_doit,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = DEVLINK_CMD_SELFTESTS_GET,
+ .doit = devlink_nl_cmd_selftests_get_doit,
+ .dumpit = devlink_nl_cmd_selftests_get_dumpit
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_SELFTESTS_RUN,
+ .doit = devlink_nl_cmd_selftests_run,
+ .flags = GENL_ADMIN_PERM,
},
};
@@ -6306,37 +9543,107 @@ static struct genl_family devlink_nl_family __ro_after_init = {
.maxattr = DEVLINK_ATTR_MAX,
.policy = devlink_nl_policy,
.netnsok = true,
+ .parallel_ops = true,
.pre_doit = devlink_nl_pre_doit,
.post_doit = devlink_nl_post_doit,
.module = THIS_MODULE,
- .ops = devlink_nl_ops,
- .n_ops = ARRAY_SIZE(devlink_nl_ops),
+ .small_ops = devlink_nl_ops,
+ .n_small_ops = ARRAY_SIZE(devlink_nl_ops),
+ .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1,
.mcgrps = devlink_nl_mcgrps,
.n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps),
};
+static bool devlink_reload_actions_valid(const struct devlink_ops *ops)
+{
+ const struct devlink_reload_combination *comb;
+ int i;
+
+ if (!devlink_reload_supported(ops)) {
+ if (WARN_ON(ops->reload_actions))
+ return false;
+ return true;
+ }
+
+ if (WARN_ON(!ops->reload_actions ||
+ ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) ||
+ ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX)))
+ return false;
+
+ if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) ||
+ ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX)))
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) {
+ comb = &devlink_reload_invalid_combinations[i];
+ if (ops->reload_actions == BIT(comb->action) &&
+ ops->reload_limits == BIT(comb->limit))
+ return false;
+ }
+ return true;
+}
+
/**
- * devlink_alloc - Allocate new devlink instance resources
+ * devlink_set_features - Set devlink supported features
+ *
+ * @devlink: devlink
+ * @features: devlink support features
+ *
+ * This interface allows us to set reload ops separatelly from
+ * the devlink_alloc.
+ */
+void devlink_set_features(struct devlink *devlink, u64 features)
+{
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
+
+ WARN_ON(features & DEVLINK_F_RELOAD &&
+ !devlink_reload_supported(devlink->ops));
+ devlink->features = features;
+}
+EXPORT_SYMBOL_GPL(devlink_set_features);
+
+/**
+ * devlink_alloc_ns - Allocate new devlink instance resources
+ * in specific namespace
*
* @ops: ops
* @priv_size: size of user private data
+ * @net: net namespace
+ * @dev: parent device
*
* Allocate new devlink instance resources, including devlink index
* and name.
*/
-struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
+struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
+ size_t priv_size, struct net *net,
+ struct device *dev)
{
struct devlink *devlink;
+ static u32 last_id;
+ int ret;
- if (WARN_ON(!ops))
+ WARN_ON(!ops || !dev);
+ if (!devlink_reload_actions_valid(ops))
return NULL;
devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);
if (!devlink)
return NULL;
+
+ ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b,
+ &last_id, GFP_KERNEL);
+ if (ret < 0) {
+ kfree(devlink);
+ return NULL;
+ }
+
+ devlink->dev = dev;
devlink->ops = ops;
- __devlink_net_set(devlink, &init_net);
+ xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
+ write_pnet(&devlink->_net, net);
INIT_LIST_HEAD(&devlink->port_list);
+ INIT_LIST_HEAD(&devlink->rate_list);
+ INIT_LIST_HEAD(&devlink->linecard_list);
INIT_LIST_HEAD(&devlink->sb_list);
INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
INIT_LIST_HEAD(&devlink->resource_list);
@@ -6345,80 +9652,141 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
INIT_LIST_HEAD(&devlink->reporter_list);
INIT_LIST_HEAD(&devlink->trap_list);
INIT_LIST_HEAD(&devlink->trap_group_list);
+ INIT_LIST_HEAD(&devlink->trap_policer_list);
+ lockdep_register_key(&devlink->lock_key);
mutex_init(&devlink->lock);
+ lockdep_set_class(&devlink->lock, &devlink->lock_key);
mutex_init(&devlink->reporters_lock);
+ mutex_init(&devlink->linecards_lock);
+ refcount_set(&devlink->refcount, 1);
+ init_completion(&devlink->comp);
+
return devlink;
}
-EXPORT_SYMBOL_GPL(devlink_alloc);
+EXPORT_SYMBOL_GPL(devlink_alloc_ns);
-/**
- * devlink_register - Register devlink instance
- *
- * @devlink: devlink
- * @dev: parent device
- */
-int devlink_register(struct devlink *devlink, struct device *dev)
+static void
+devlink_trap_policer_notify(struct devlink *devlink,
+ const struct devlink_trap_policer_item *policer_item,
+ enum devlink_command cmd);
+static void
+devlink_trap_group_notify(struct devlink *devlink,
+ const struct devlink_trap_group_item *group_item,
+ enum devlink_command cmd);
+static void devlink_trap_notify(struct devlink *devlink,
+ const struct devlink_trap_item *trap_item,
+ enum devlink_command cmd);
+
+static void devlink_notify_register(struct devlink *devlink)
{
- mutex_lock(&devlink_mutex);
- devlink->dev = dev;
- devlink->registered = true;
- list_add_tail(&devlink->list, &devlink_list);
+ struct devlink_trap_policer_item *policer_item;
+ struct devlink_trap_group_item *group_item;
+ struct devlink_param_item *param_item;
+ struct devlink_trap_item *trap_item;
+ struct devlink_port *devlink_port;
+ struct devlink_linecard *linecard;
+ struct devlink_rate *rate_node;
+ struct devlink_region *region;
+
devlink_notify(devlink, DEVLINK_CMD_NEW);
- mutex_unlock(&devlink_mutex);
- return 0;
+ list_for_each_entry(linecard, &devlink->linecard_list, list)
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+
+ list_for_each_entry(devlink_port, &devlink->port_list, list)
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+
+ list_for_each_entry(policer_item, &devlink->trap_policer_list, list)
+ devlink_trap_policer_notify(devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_NEW);
+
+ list_for_each_entry(group_item, &devlink->trap_group_list, list)
+ devlink_trap_group_notify(devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_NEW);
+
+ list_for_each_entry(trap_item, &devlink->trap_list, list)
+ devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW);
+
+ list_for_each_entry(rate_node, &devlink->rate_list, list)
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
+
+ list_for_each_entry(region, &devlink->region_list, list)
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
+
+ list_for_each_entry(param_item, &devlink->param_list, list)
+ devlink_param_notify(devlink, 0, param_item,
+ DEVLINK_CMD_PARAM_NEW);
}
-EXPORT_SYMBOL_GPL(devlink_register);
-/**
- * devlink_unregister - Unregister devlink instance
- *
- * @devlink: devlink
- */
-void devlink_unregister(struct devlink *devlink)
+static void devlink_notify_unregister(struct devlink *devlink)
{
- mutex_lock(&devlink_mutex);
- WARN_ON(devlink_reload_supported(devlink) &&
- devlink->reload_enabled);
+ struct devlink_trap_policer_item *policer_item;
+ struct devlink_trap_group_item *group_item;
+ struct devlink_param_item *param_item;
+ struct devlink_trap_item *trap_item;
+ struct devlink_port *devlink_port;
+ struct devlink_rate *rate_node;
+ struct devlink_region *region;
+
+ list_for_each_entry_reverse(param_item, &devlink->param_list, list)
+ devlink_param_notify(devlink, 0, param_item,
+ DEVLINK_CMD_PARAM_DEL);
+
+ list_for_each_entry_reverse(region, &devlink->region_list, list)
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
+
+ list_for_each_entry_reverse(rate_node, &devlink->rate_list, list)
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL);
+
+ list_for_each_entry_reverse(trap_item, &devlink->trap_list, list)
+ devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL);
+
+ list_for_each_entry_reverse(group_item, &devlink->trap_group_list, list)
+ devlink_trap_group_notify(devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_DEL);
+ list_for_each_entry_reverse(policer_item, &devlink->trap_policer_list,
+ list)
+ devlink_trap_policer_notify(devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_DEL);
+
+ list_for_each_entry_reverse(devlink_port, &devlink->port_list, list)
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
devlink_notify(devlink, DEVLINK_CMD_DEL);
- list_del(&devlink->list);
- mutex_unlock(&devlink_mutex);
}
-EXPORT_SYMBOL_GPL(devlink_unregister);
/**
- * devlink_reload_enable - Enable reload of devlink instance
+ * devlink_register - Register devlink instance
*
* @devlink: devlink
- *
- * Should be called at end of device initialization
- * process when reload operation is supported.
*/
-void devlink_reload_enable(struct devlink *devlink)
+void devlink_register(struct devlink *devlink)
{
- mutex_lock(&devlink_mutex);
- devlink->reload_enabled = true;
- mutex_unlock(&devlink_mutex);
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
+ /* Make sure that we are in .probe() routine */
+
+ xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
+ devlink_notify_register(devlink);
}
-EXPORT_SYMBOL_GPL(devlink_reload_enable);
+EXPORT_SYMBOL_GPL(devlink_register);
/**
- * devlink_reload_disable - Disable reload of devlink instance
+ * devlink_unregister - Unregister devlink instance
*
* @devlink: devlink
- *
- * Should be called at the beginning of device cleanup
- * process when reload operation is supported.
*/
-void devlink_reload_disable(struct devlink *devlink)
+void devlink_unregister(struct devlink *devlink)
{
- mutex_lock(&devlink_mutex);
- /* Mutex is taken which ensures that no reload operation is in
- * progress while setting up forbidded flag.
- */
- devlink->reload_enabled = false;
- mutex_unlock(&devlink_mutex);
+ ASSERT_DEVLINK_REGISTERED(devlink);
+ /* Make sure that we are in .remove() routine */
+
+ xa_set_mark(&devlinks, devlink->index, DEVLINK_UNREGISTERING);
+ devlink_put(devlink);
+ wait_for_completion(&devlink->comp);
+
+ devlink_notify_unregister(devlink);
+ xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
+ xa_clear_mark(&devlinks, devlink->index, DEVLINK_UNREGISTERING);
}
-EXPORT_SYMBOL_GPL(devlink_reload_disable);
+EXPORT_SYMBOL_GPL(devlink_unregister);
/**
* devlink_free - Free devlink instance resources
@@ -6427,8 +9795,13 @@ EXPORT_SYMBOL_GPL(devlink_reload_disable);
*/
void devlink_free(struct devlink *devlink)
{
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
+
+ mutex_destroy(&devlink->linecards_lock);
mutex_destroy(&devlink->reporters_lock);
mutex_destroy(&devlink->lock);
+ lockdep_unregister_key(&devlink->lock_key);
+ WARN_ON(!list_empty(&devlink->trap_policer_list));
WARN_ON(!list_empty(&devlink->trap_group_list));
WARN_ON(!list_empty(&devlink->trap_list));
WARN_ON(!list_empty(&devlink->reporter_list));
@@ -6437,8 +9810,13 @@ void devlink_free(struct devlink *devlink)
WARN_ON(!list_empty(&devlink->resource_list));
WARN_ON(!list_empty(&devlink->dpipe_table_list));
WARN_ON(!list_empty(&devlink->sb_list));
+ WARN_ON(!list_empty(&devlink->rate_list));
+ WARN_ON(!list_empty(&devlink->linecard_list));
WARN_ON(!list_empty(&devlink->port_list));
+ xa_destroy(&devlink->snapshot_ids);
+ xa_erase(&devlinks, devlink->index);
+
kfree(devlink);
}
EXPORT_SYMBOL_GPL(devlink_free);
@@ -6452,7 +9830,8 @@ static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
{
/* Ignore CPU and DSA flavours. */
return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
- devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA;
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA &&
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED;
}
#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600)
@@ -6476,6 +9855,83 @@ static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
}
/**
+ * devlink_port_init() - Init devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ *
+ * Initialize essencial stuff that is needed for functions
+ * that may be called before devlink port registration.
+ * Call to this function is optional and not needed
+ * in case the driver does not use such functions.
+ */
+void devlink_port_init(struct devlink *devlink,
+ struct devlink_port *devlink_port)
+{
+ if (devlink_port->initialized)
+ return;
+ devlink_port->devlink = devlink;
+ INIT_LIST_HEAD(&devlink_port->region_list);
+ devlink_port->initialized = true;
+}
+EXPORT_SYMBOL_GPL(devlink_port_init);
+
+/**
+ * devlink_port_fini() - Deinitialize devlink port
+ *
+ * @devlink_port: devlink port
+ *
+ * Deinitialize essencial stuff that is in use for functions
+ * that may be called after devlink port unregistration.
+ * Call to this function is optional and not needed
+ * in case the driver does not use such functions.
+ */
+void devlink_port_fini(struct devlink_port *devlink_port)
+{
+ WARN_ON(!list_empty(&devlink_port->region_list));
+}
+EXPORT_SYMBOL_GPL(devlink_port_fini);
+
+/**
+ * devl_port_register() - Register devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ * @port_index: driver-specific numerical identifier of the port
+ *
+ * Register devlink port with provided port index. User can use
+ * any indexing, even hw-related one. devlink_port structure
+ * is convenient to be embedded inside user driver private structure.
+ * Note that the caller should take care of zeroing the devlink_port
+ * structure.
+ */
+int devl_port_register(struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ unsigned int port_index)
+{
+ devl_assert_locked(devlink);
+
+ if (devlink_port_index_exists(devlink, port_index))
+ return -EEXIST;
+
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ devlink_port_init(devlink, devlink_port);
+ devlink_port->registered = true;
+ devlink_port->index = port_index;
+ spin_lock_init(&devlink_port->type_lock);
+ INIT_LIST_HEAD(&devlink_port->reporter_list);
+ mutex_init(&devlink_port->reporters_lock);
+ list_add_tail(&devlink_port->list, &devlink->port_list);
+
+ INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
+ devlink_port_type_warn_schedule(devlink_port);
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_port_register);
+
+/**
* devlink_port_register - Register devlink port
*
* @devlink: devlink
@@ -6487,44 +9943,54 @@ static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
* is convenient to be embedded inside user driver private structure.
* Note that the caller should take care of zeroing the devlink_port
* structure.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
*/
int devlink_port_register(struct devlink *devlink,
struct devlink_port *devlink_port,
unsigned int port_index)
{
- mutex_lock(&devlink->lock);
- if (devlink_port_index_exists(devlink, port_index)) {
- mutex_unlock(&devlink->lock);
- return -EEXIST;
- }
- devlink_port->devlink = devlink;
- devlink_port->index = port_index;
- devlink_port->registered = true;
- spin_lock_init(&devlink_port->type_lock);
- list_add_tail(&devlink_port->list, &devlink->port_list);
- INIT_LIST_HEAD(&devlink_port->param_list);
- mutex_unlock(&devlink->lock);
- INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
- devlink_port_type_warn_schedule(devlink_port);
- devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
- return 0;
+ int err;
+
+ devl_lock(devlink);
+ err = devl_port_register(devlink, devlink_port, port_index);
+ devl_unlock(devlink);
+ return err;
}
EXPORT_SYMBOL_GPL(devlink_port_register);
/**
+ * devl_port_unregister() - Unregister devlink port
+ *
+ * @devlink_port: devlink port
+ */
+void devl_port_unregister(struct devlink_port *devlink_port)
+{
+ lockdep_assert_held(&devlink_port->devlink->lock);
+
+ devlink_port_type_warn_cancel(devlink_port);
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
+ list_del(&devlink_port->list);
+ WARN_ON(!list_empty(&devlink_port->reporter_list));
+ mutex_destroy(&devlink_port->reporters_lock);
+ devlink_port->registered = false;
+}
+EXPORT_SYMBOL_GPL(devl_port_unregister);
+
+/**
* devlink_port_unregister - Unregister devlink port
*
* @devlink_port: devlink port
+ *
+ * Context: Takes and release devlink->lock <mutex>.
*/
void devlink_port_unregister(struct devlink_port *devlink_port)
{
struct devlink *devlink = devlink_port->devlink;
- devlink_port_type_warn_cancel(devlink_port);
- devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
- mutex_lock(&devlink->lock);
- list_del(&devlink_port->list);
- mutex_unlock(&devlink->lock);
+ devl_lock(devlink);
+ devl_port_unregister(devlink_port);
+ devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_port_unregister);
@@ -6532,8 +9998,8 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
enum devlink_port_type type,
void *type_dev)
{
- if (WARN_ON(!devlink_port->registered))
- return;
+ ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);
+
devlink_port_type_warn_cancel(devlink_port);
spin_lock_bh(&devlink_port->type_lock);
devlink_port->type = type;
@@ -6542,14 +10008,8 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
}
-/**
- * devlink_port_type_eth_set - Set port type to Ethernet
- *
- * @devlink_port: devlink port
- * @netdev: related netdevice
- */
-void devlink_port_type_eth_set(struct devlink_port *devlink_port,
- struct net_device *netdev)
+static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port,
+ struct net_device *netdev)
{
const struct net_device_ops *ops = netdev->netdev_ops;
@@ -6583,6 +10043,24 @@ void devlink_port_type_eth_set(struct devlink_port *devlink_port,
err = ops->ndo_get_port_parent_id(netdev, &ppid);
WARN_ON(err != -EOPNOTSUPP);
}
+}
+
+/**
+ * devlink_port_type_eth_set - Set port type to Ethernet
+ *
+ * @devlink_port: devlink port
+ * @netdev: related netdevice
+ */
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+ struct net_device *netdev)
+{
+ if (netdev)
+ devlink_port_type_netdev_checks(devlink_port, netdev);
+ else
+ dev_warn(devlink_port->devlink->dev,
+ "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n",
+ devlink_port->index);
+
__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, netdev);
}
EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
@@ -6613,24 +10091,18 @@ void devlink_port_type_clear(struct devlink_port *devlink_port)
EXPORT_SYMBOL_GPL(devlink_port_type_clear);
static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
- enum devlink_port_flavour flavour,
- const unsigned char *switch_id,
- unsigned char switch_id_len)
+ enum devlink_port_flavour flavour)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
- if (WARN_ON(devlink_port->registered))
- return -EEXIST;
- attrs->set = true;
+ devlink_port->attrs_set = true;
attrs->flavour = flavour;
- if (switch_id) {
- attrs->switch_port = true;
- if (WARN_ON(switch_id_len > MAX_PHYS_ITEM_ID_LEN))
- switch_id_len = MAX_PHYS_ITEM_ID_LEN;
- memcpy(attrs->switch_id.id, switch_id, switch_id_len);
- attrs->switch_id.id_len = switch_id_len;
+ if (attrs->switch_id.id_len) {
+ devlink_port->switch_port = true;
+ if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN))
+ attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN;
} else {
- attrs->switch_port = false;
+ devlink_port->switch_port = false;
}
return 0;
}
@@ -6639,33 +10111,20 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
* devlink_port_attrs_set - Set port attributes
*
* @devlink_port: devlink port
- * @flavour: flavour of the port
- * @port_number: number of the port that is facing user, for example
- * the front panel port number
- * @split: indicates if this is split port
- * @split_subport_number: if the port is split, this is the number
- * of subport.
- * @switch_id: if the port is part of switch, this is buffer with ID,
- * otwerwise this is NULL
- * @switch_id_len: length of the switch_id buffer
+ * @attrs: devlink port attrs
*/
void devlink_port_attrs_set(struct devlink_port *devlink_port,
- enum devlink_port_flavour flavour,
- u32 port_number, bool split,
- u32 split_subport_number,
- const unsigned char *switch_id,
- unsigned char switch_id_len)
+ struct devlink_port_attrs *attrs)
{
- struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
- ret = __devlink_port_attrs_set(devlink_port, flavour,
- switch_id, switch_id_len);
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ devlink_port->attrs = *attrs;
+ ret = __devlink_port_attrs_set(devlink_port, attrs->flavour);
if (ret)
return;
- attrs->split = split;
- attrs->phys.port_number = port_number;
- attrs->phys.split_subport_number = split_subport_number;
+ WARN_ON(attrs->splittable && attrs->split);
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
@@ -6673,25 +10132,25 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
* devlink_port_attrs_pci_pf_set - Set PCI PF port attributes
*
* @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
* @pf: associated PF for the devlink port instance
- * @switch_id: if the port is part of switch, this is buffer with ID,
- * otherwise this is NULL
- * @switch_id_len: length of the switch_id buffer
+ * @external: indicates if the port is for an external controller
*/
-void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port,
- const unsigned char *switch_id,
- unsigned char switch_id_len, u16 pf)
+void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
ret = __devlink_port_attrs_set(devlink_port,
- DEVLINK_PORT_FLAVOUR_PCI_PF,
- switch_id, switch_id_len);
+ DEVLINK_PORT_FLAVOUR_PCI_PF);
if (ret)
return;
-
+ attrs->pci_pf.controller = controller;
attrs->pci_pf.pf = pf;
+ attrs->pci_pf.external = external;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
@@ -6699,62 +10158,230 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
* devlink_port_attrs_pci_vf_set - Set PCI VF port attributes
*
* @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
* @pf: associated PF for the devlink port instance
* @vf: associated VF of a PF for the devlink port instance
- * @switch_id: if the port is part of switch, this is buffer with ID,
- * otherwise this is NULL
- * @switch_id_len: length of the switch_id buffer
+ * @external: indicates if the port is for an external controller
*/
-void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port,
- const unsigned char *switch_id,
- unsigned char switch_id_len,
- u16 pf, u16 vf)
+void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, u16 vf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
ret = __devlink_port_attrs_set(devlink_port,
- DEVLINK_PORT_FLAVOUR_PCI_VF,
- switch_id, switch_id_len);
+ DEVLINK_PORT_FLAVOUR_PCI_VF);
if (ret)
return;
+ attrs->pci_vf.controller = controller;
attrs->pci_vf.pf = pf;
attrs->pci_vf.vf = vf;
+ attrs->pci_vf.external = external;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set);
+/**
+ * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes
+ *
+ * @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
+ * @pf: associated PF for the devlink port instance
+ * @sf: associated SF of a PF for the devlink port instance
+ * @external: indicates if the port is for an external controller
+ */
+void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, u32 sf, bool external)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int ret;
+
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ ret = __devlink_port_attrs_set(devlink_port,
+ DEVLINK_PORT_FLAVOUR_PCI_SF);
+ if (ret)
+ return;
+ attrs->pci_sf.controller = controller;
+ attrs->pci_sf.pf = pf;
+ attrs->pci_sf.sf = sf;
+ attrs->pci_sf.external = external;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set);
+
+/**
+ * devl_rate_leaf_create - create devlink rate leaf
+ * @devlink_port: devlink port object to create rate object on
+ * @priv: driver private data
+ *
+ * Create devlink rate object of type leaf on provided @devlink_port.
+ */
+int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv)
+{
+ struct devlink *devlink = devlink_port->devlink;
+ struct devlink_rate *devlink_rate;
+
+ devl_assert_locked(devlink_port->devlink);
+
+ if (WARN_ON(devlink_port->devlink_rate))
+ return -EBUSY;
+
+ devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL);
+ if (!devlink_rate)
+ return -ENOMEM;
+
+ devlink_rate->type = DEVLINK_RATE_TYPE_LEAF;
+ devlink_rate->devlink = devlink;
+ devlink_rate->devlink_port = devlink_port;
+ devlink_rate->priv = priv;
+ list_add_tail(&devlink_rate->list, &devlink->rate_list);
+ devlink_port->devlink_rate = devlink_rate;
+ devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_rate_leaf_create);
+
+/**
+ * devl_rate_leaf_destroy - destroy devlink rate leaf
+ *
+ * @devlink_port: devlink port linked to the rate object
+ *
+ * Destroy the devlink rate object of type leaf on provided @devlink_port.
+ */
+void devl_rate_leaf_destroy(struct devlink_port *devlink_port)
+{
+ struct devlink_rate *devlink_rate = devlink_port->devlink_rate;
+
+ devl_assert_locked(devlink_port->devlink);
+ if (!devlink_rate)
+ return;
+
+ devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL);
+ if (devlink_rate->parent)
+ refcount_dec(&devlink_rate->parent->refcnt);
+ list_del(&devlink_rate->list);
+ devlink_port->devlink_rate = NULL;
+ kfree(devlink_rate);
+}
+EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy);
+
+/**
+ * devl_rate_nodes_destroy - destroy all devlink rate nodes on device
+ * @devlink: devlink instance
+ *
+ * Unset parent for all rate objects and destroy all rate nodes
+ * on specified device.
+ */
+void devl_rate_nodes_destroy(struct devlink *devlink)
+{
+ static struct devlink_rate *devlink_rate, *tmp;
+ const struct devlink_ops *ops = devlink->ops;
+
+ devl_assert_locked(devlink);
+
+ list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
+ if (!devlink_rate->parent)
+ continue;
+
+ refcount_dec(&devlink_rate->parent->refcnt);
+ if (devlink_rate_is_leaf(devlink_rate))
+ ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv,
+ NULL, NULL);
+ else if (devlink_rate_is_node(devlink_rate))
+ ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv,
+ NULL, NULL);
+ }
+ list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) {
+ if (devlink_rate_is_node(devlink_rate)) {
+ ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL);
+ list_del(&devlink_rate->list);
+ kfree(devlink_rate->name);
+ kfree(devlink_rate);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy);
+
+/**
+ * devlink_port_linecard_set - Link port with a linecard
+ *
+ * @devlink_port: devlink port
+ * @linecard: devlink linecard
+ */
+void devlink_port_linecard_set(struct devlink_port *devlink_port,
+ struct devlink_linecard *linecard)
+{
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ devlink_port->linecard = linecard;
+}
+EXPORT_SYMBOL_GPL(devlink_port_linecard_set);
+
static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
char *name, size_t len)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int n = 0;
- if (!attrs->set)
+ if (!devlink_port->attrs_set)
return -EOPNOTSUPP;
switch (attrs->flavour) {
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
- if (!attrs->split)
- n = snprintf(name, len, "p%u", attrs->phys.port_number);
- else
- n = snprintf(name, len, "p%us%u",
- attrs->phys.port_number,
- attrs->phys.split_subport_number);
+ if (devlink_port->linecard)
+ n = snprintf(name, len, "l%u",
+ devlink_port->linecard->index);
+ if (n < len)
+ n += snprintf(name + n, len - n, "p%u",
+ attrs->phys.port_number);
+ if (n < len && attrs->split)
+ n += snprintf(name + n, len - n, "s%u",
+ attrs->phys.split_subport_number);
break;
case DEVLINK_PORT_FLAVOUR_CPU:
case DEVLINK_PORT_FLAVOUR_DSA:
+ case DEVLINK_PORT_FLAVOUR_UNUSED:
/* As CPU and DSA ports do not have a netdevice associated
* case should not ever happen.
*/
WARN_ON(1);
return -EINVAL;
case DEVLINK_PORT_FLAVOUR_PCI_PF:
+ if (attrs->pci_pf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_pf.controller);
+ if (n >= len)
+ return -EINVAL;
+ len -= n;
+ name += n;
+ }
n = snprintf(name, len, "pf%u", attrs->pci_pf.pf);
break;
case DEVLINK_PORT_FLAVOUR_PCI_VF:
+ if (attrs->pci_vf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_vf.controller);
+ if (n >= len)
+ return -EINVAL;
+ len -= n;
+ name += n;
+ }
n = snprintf(name, len, "pf%uvf%u",
attrs->pci_vf.pf, attrs->pci_vf.vf);
break;
+ case DEVLINK_PORT_FLAVOUR_PCI_SF:
+ if (attrs->pci_sf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_sf.controller);
+ if (n >= len)
+ return -EINVAL;
+ len -= n;
+ name += n;
+ }
+ n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf,
+ attrs->pci_sf.sf);
+ break;
+ case DEVLINK_PORT_FLAVOUR_VIRTUAL:
+ return -EOPNOTSUPP;
}
if (n >= len)
@@ -6763,25 +10390,241 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
return 0;
}
-int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
- u32 size, u16 ingress_pools_count,
- u16 egress_pools_count, u16 ingress_tc_count,
- u16 egress_tc_count)
+static int devlink_linecard_types_init(struct devlink_linecard *linecard)
{
- struct devlink_sb *devlink_sb;
- int err = 0;
+ struct devlink_linecard_type *linecard_type;
+ unsigned int count;
+ int i;
- mutex_lock(&devlink->lock);
- if (devlink_sb_index_exists(devlink, sb_index)) {
- err = -EEXIST;
- goto unlock;
+ count = linecard->ops->types_count(linecard, linecard->priv);
+ linecard->types = kmalloc_array(count, sizeof(*linecard_type),
+ GFP_KERNEL);
+ if (!linecard->types)
+ return -ENOMEM;
+ linecard->types_count = count;
+
+ for (i = 0; i < count; i++) {
+ linecard_type = &linecard->types[i];
+ linecard->ops->types_get(linecard, linecard->priv, i,
+ &linecard_type->type,
+ &linecard_type->priv);
+ }
+ return 0;
+}
+
+static void devlink_linecard_types_fini(struct devlink_linecard *linecard)
+{
+ kfree(linecard->types);
+}
+
+/**
+ * devlink_linecard_create - Create devlink linecard
+ *
+ * @devlink: devlink
+ * @linecard_index: driver-specific numerical identifier of the linecard
+ * @ops: linecards ops
+ * @priv: user priv pointer
+ *
+ * Create devlink linecard instance with provided linecard index.
+ * Caller can use any indexing, even hw-related one.
+ *
+ * Return: Line card structure or an ERR_PTR() encoded error code.
+ */
+struct devlink_linecard *
+devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index,
+ const struct devlink_linecard_ops *ops, void *priv)
+{
+ struct devlink_linecard *linecard;
+ int err;
+
+ if (WARN_ON(!ops || !ops->provision || !ops->unprovision ||
+ !ops->types_count || !ops->types_get))
+ return ERR_PTR(-EINVAL);
+
+ mutex_lock(&devlink->linecards_lock);
+ if (devlink_linecard_index_exists(devlink, linecard_index)) {
+ mutex_unlock(&devlink->linecards_lock);
+ return ERR_PTR(-EEXIST);
}
- devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL);
- if (!devlink_sb) {
- err = -ENOMEM;
- goto unlock;
+ linecard = kzalloc(sizeof(*linecard), GFP_KERNEL);
+ if (!linecard) {
+ mutex_unlock(&devlink->linecards_lock);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ linecard->devlink = devlink;
+ linecard->index = linecard_index;
+ linecard->ops = ops;
+ linecard->priv = priv;
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ mutex_init(&linecard->state_lock);
+
+ err = devlink_linecard_types_init(linecard);
+ if (err) {
+ mutex_destroy(&linecard->state_lock);
+ kfree(linecard);
+ mutex_unlock(&devlink->linecards_lock);
+ return ERR_PTR(err);
+ }
+
+ list_add_tail(&linecard->list, &devlink->linecard_list);
+ refcount_set(&linecard->refcount, 1);
+ mutex_unlock(&devlink->linecards_lock);
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ return linecard;
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_create);
+
+/**
+ * devlink_linecard_destroy - Destroy devlink linecard
+ *
+ * @linecard: devlink linecard
+ */
+void devlink_linecard_destroy(struct devlink_linecard *linecard)
+{
+ struct devlink *devlink = linecard->devlink;
+
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL);
+ mutex_lock(&devlink->linecards_lock);
+ list_del(&linecard->list);
+ devlink_linecard_types_fini(linecard);
+ mutex_unlock(&devlink->linecards_lock);
+ devlink_linecard_put(linecard);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_destroy);
+
+/**
+ * devlink_linecard_provision_set - Set provisioning on linecard
+ *
+ * @linecard: devlink linecard
+ * @type: linecard type
+ *
+ * This is either called directly from the provision() op call or
+ * as a result of the provision() op call asynchronously.
+ */
+void devlink_linecard_provision_set(struct devlink_linecard *linecard,
+ const char *type)
+{
+ mutex_lock(&linecard->state_lock);
+ WARN_ON(linecard->type && strcmp(linecard->type, type));
+ linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED;
+ linecard->type = type;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_provision_set);
+
+/**
+ * devlink_linecard_provision_clear - Clear provisioning on linecard
+ *
+ * @linecard: devlink linecard
+ *
+ * This is either called directly from the unprovision() op call or
+ * as a result of the unprovision() op call asynchronously.
+ */
+void devlink_linecard_provision_clear(struct devlink_linecard *linecard)
+{
+ mutex_lock(&linecard->state_lock);
+ WARN_ON(linecard->nested_devlink);
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ linecard->type = NULL;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear);
+
+/**
+ * devlink_linecard_provision_fail - Fail provisioning on linecard
+ *
+ * @linecard: devlink linecard
+ *
+ * This is either called directly from the provision() op call or
+ * as a result of the provision() op call asynchronously.
+ */
+void devlink_linecard_provision_fail(struct devlink_linecard *linecard)
+{
+ mutex_lock(&linecard->state_lock);
+ WARN_ON(linecard->nested_devlink);
+ linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail);
+
+/**
+ * devlink_linecard_activate - Set linecard active
+ *
+ * @linecard: devlink linecard
+ */
+void devlink_linecard_activate(struct devlink_linecard *linecard)
+{
+ mutex_lock(&linecard->state_lock);
+ WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED);
+ linecard->state = DEVLINK_LINECARD_STATE_ACTIVE;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_activate);
+
+/**
+ * devlink_linecard_deactivate - Set linecard inactive
+ *
+ * @linecard: devlink linecard
+ */
+void devlink_linecard_deactivate(struct devlink_linecard *linecard)
+{
+ mutex_lock(&linecard->state_lock);
+ switch (linecard->state) {
+ case DEVLINK_LINECARD_STATE_ACTIVE:
+ linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ break;
+ case DEVLINK_LINECARD_STATE_UNPROVISIONING:
+ /* Line card is being deactivated as part
+ * of unprovisioning flow.
+ */
+ break;
+ default:
+ WARN_ON(1);
+ break;
}
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_deactivate);
+
+/**
+ * devlink_linecard_nested_dl_set - Attach/detach nested devlink
+ * instance to linecard.
+ *
+ * @linecard: devlink linecard
+ * @nested_devlink: devlink instance to attach or NULL to detach
+ */
+void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard,
+ struct devlink *nested_devlink)
+{
+ mutex_lock(&linecard->state_lock);
+ linecard->nested_devlink = nested_devlink;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set);
+
+int devl_sb_register(struct devlink *devlink, unsigned int sb_index,
+ u32 size, u16 ingress_pools_count,
+ u16 egress_pools_count, u16 ingress_tc_count,
+ u16 egress_tc_count)
+{
+ struct devlink_sb *devlink_sb;
+
+ lockdep_assert_held(&devlink->lock);
+
+ if (devlink_sb_index_exists(devlink, sb_index))
+ return -EEXIST;
+
+ devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL);
+ if (!devlink_sb)
+ return -ENOMEM;
devlink_sb->index = sb_index;
devlink_sb->size = size;
devlink_sb->ingress_pools_count = ingress_pools_count;
@@ -6789,57 +10632,78 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
devlink_sb->ingress_tc_count = ingress_tc_count;
devlink_sb->egress_tc_count = egress_tc_count;
list_add_tail(&devlink_sb->list, &devlink->sb_list);
-unlock:
- mutex_unlock(&devlink->lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_sb_register);
+
+int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
+ u32 size, u16 ingress_pools_count,
+ u16 egress_pools_count, u16 ingress_tc_count,
+ u16 egress_tc_count)
+{
+ int err;
+
+ devl_lock(devlink);
+ err = devl_sb_register(devlink, sb_index, size, ingress_pools_count,
+ egress_pools_count, ingress_tc_count,
+ egress_tc_count);
+ devl_unlock(devlink);
return err;
}
EXPORT_SYMBOL_GPL(devlink_sb_register);
-void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
+void devl_sb_unregister(struct devlink *devlink, unsigned int sb_index)
{
struct devlink_sb *devlink_sb;
- mutex_lock(&devlink->lock);
+ lockdep_assert_held(&devlink->lock);
+
devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
WARN_ON(!devlink_sb);
list_del(&devlink_sb->list);
- mutex_unlock(&devlink->lock);
kfree(devlink_sb);
}
+EXPORT_SYMBOL_GPL(devl_sb_unregister);
+
+void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
+{
+ devl_lock(devlink);
+ devl_sb_unregister(devlink, sb_index);
+ devl_unlock(devlink);
+}
EXPORT_SYMBOL_GPL(devlink_sb_unregister);
/**
- * devlink_dpipe_headers_register - register dpipe headers
+ * devl_dpipe_headers_register - register dpipe headers
*
- * @devlink: devlink
- * @dpipe_headers: dpipe header array
+ * @devlink: devlink
+ * @dpipe_headers: dpipe header array
*
- * Register the headers supported by hardware.
+ * Register the headers supported by hardware.
*/
-int devlink_dpipe_headers_register(struct devlink *devlink,
- struct devlink_dpipe_headers *dpipe_headers)
+void devl_dpipe_headers_register(struct devlink *devlink,
+ struct devlink_dpipe_headers *dpipe_headers)
{
- mutex_lock(&devlink->lock);
+ lockdep_assert_held(&devlink->lock);
+
devlink->dpipe_headers = dpipe_headers;
- mutex_unlock(&devlink->lock);
- return 0;
}
-EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
+EXPORT_SYMBOL_GPL(devl_dpipe_headers_register);
/**
- * devlink_dpipe_headers_unregister - unregister dpipe headers
+ * devl_dpipe_headers_unregister - unregister dpipe headers
*
- * @devlink: devlink
+ * @devlink: devlink
*
- * Unregister the headers supported by hardware.
+ * Unregister the headers supported by hardware.
*/
-void devlink_dpipe_headers_unregister(struct devlink *devlink)
+void devl_dpipe_headers_unregister(struct devlink *devlink)
{
- mutex_lock(&devlink->lock);
+ lockdep_assert_held(&devlink->lock);
+
devlink->dpipe_headers = NULL;
- mutex_unlock(&devlink->lock);
}
-EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister);
+EXPORT_SYMBOL_GPL(devl_dpipe_headers_unregister);
/**
* devlink_dpipe_table_counter_enabled - check if counter allocation
@@ -6873,38 +10737,33 @@ bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled);
/**
- * devlink_dpipe_table_register - register dpipe table
+ * devl_dpipe_table_register - register dpipe table
*
- * @devlink: devlink
- * @table_name: table name
- * @table_ops: table ops
- * @priv: priv
- * @counter_control_extern: external control for counters
+ * @devlink: devlink
+ * @table_name: table name
+ * @table_ops: table ops
+ * @priv: priv
+ * @counter_control_extern: external control for counters
*/
-int devlink_dpipe_table_register(struct devlink *devlink,
- const char *table_name,
- struct devlink_dpipe_table_ops *table_ops,
- void *priv, bool counter_control_extern)
+int devl_dpipe_table_register(struct devlink *devlink,
+ const char *table_name,
+ struct devlink_dpipe_table_ops *table_ops,
+ void *priv, bool counter_control_extern)
{
struct devlink_dpipe_table *table;
- int err = 0;
+
+ lockdep_assert_held(&devlink->lock);
if (WARN_ON(!table_ops->size_get))
return -EINVAL;
- mutex_lock(&devlink->lock);
-
if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name,
- devlink)) {
- err = -EEXIST;
- goto unlock;
- }
+ devlink))
+ return -EEXIST;
table = kzalloc(sizeof(*table), GFP_KERNEL);
- if (!table) {
- err = -ENOMEM;
- goto unlock;
- }
+ if (!table)
+ return -ENOMEM;
table->name = table_name;
table->table_ops = table_ops;
@@ -6912,73 +10771,69 @@ int devlink_dpipe_table_register(struct devlink *devlink,
table->counter_control_extern = counter_control_extern;
list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
-unlock:
- mutex_unlock(&devlink->lock);
- return err;
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(devlink_dpipe_table_register);
+EXPORT_SYMBOL_GPL(devl_dpipe_table_register);
/**
- * devlink_dpipe_table_unregister - unregister dpipe table
+ * devl_dpipe_table_unregister - unregister dpipe table
*
- * @devlink: devlink
- * @table_name: table name
+ * @devlink: devlink
+ * @table_name: table name
*/
-void devlink_dpipe_table_unregister(struct devlink *devlink,
- const char *table_name)
+void devl_dpipe_table_unregister(struct devlink *devlink,
+ const char *table_name)
{
struct devlink_dpipe_table *table;
- mutex_lock(&devlink->lock);
+ lockdep_assert_held(&devlink->lock);
+
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
table_name, devlink);
if (!table)
- goto unlock;
+ return;
list_del_rcu(&table->list);
- mutex_unlock(&devlink->lock);
kfree_rcu(table, rcu);
- return;
-unlock:
- mutex_unlock(&devlink->lock);
}
-EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
+EXPORT_SYMBOL_GPL(devl_dpipe_table_unregister);
/**
- * devlink_resource_register - devlink resource register
+ * devl_resource_register - devlink resource register
*
- * @devlink: devlink
- * @resource_name: resource's name
- * @resource_size: resource's size
- * @resource_id: resource's id
- * @parent_resource_id: resource's parent id
- * @size_params: size parameters
+ * @devlink: devlink
+ * @resource_name: resource's name
+ * @resource_size: resource's size
+ * @resource_id: resource's id
+ * @parent_resource_id: resource's parent id
+ * @size_params: size parameters
+ *
+ * Generic resources should reuse the same names across drivers.
+ * Please see the generic resources list at:
+ * Documentation/networking/devlink/devlink-resource.rst
*/
-int devlink_resource_register(struct devlink *devlink,
- const char *resource_name,
- u64 resource_size,
- u64 resource_id,
- u64 parent_resource_id,
- const struct devlink_resource_size_params *size_params)
+int devl_resource_register(struct devlink *devlink,
+ const char *resource_name,
+ u64 resource_size,
+ u64 resource_id,
+ u64 parent_resource_id,
+ const struct devlink_resource_size_params *size_params)
{
struct devlink_resource *resource;
struct list_head *resource_list;
bool top_hierarchy;
- int err = 0;
+
+ lockdep_assert_held(&devlink->lock);
top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP;
- mutex_lock(&devlink->lock);
resource = devlink_resource_find(devlink, NULL, resource_id);
- if (resource) {
- err = -EINVAL;
- goto out;
- }
+ if (resource)
+ return -EINVAL;
resource = kzalloc(sizeof(*resource), GFP_KERNEL);
- if (!resource) {
- err = -ENOMEM;
- goto out;
- }
+ if (!resource)
+ return -ENOMEM;
if (top_hierarchy) {
resource_list = &devlink->resource_list;
@@ -6992,8 +10847,7 @@ int devlink_resource_register(struct devlink *devlink,
resource->parent = parent_resource;
} else {
kfree(resource);
- err = -EINVAL;
- goto out;
+ return -EINVAL;
}
}
@@ -7006,101 +10860,168 @@ int devlink_resource_register(struct devlink *devlink,
sizeof(resource->size_params));
INIT_LIST_HEAD(&resource->resource_list);
list_add_tail(&resource->list, resource_list);
-out:
- mutex_unlock(&devlink->lock);
- return err;
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(devlink_resource_register);
+EXPORT_SYMBOL_GPL(devl_resource_register);
/**
- * devlink_resources_unregister - free all resources
+ * devlink_resource_register - devlink resource register
*
* @devlink: devlink
- * @resource: resource
+ * @resource_name: resource's name
+ * @resource_size: resource's size
+ * @resource_id: resource's id
+ * @parent_resource_id: resource's parent id
+ * @size_params: size parameters
+ *
+ * Generic resources should reuse the same names across drivers.
+ * Please see the generic resources list at:
+ * Documentation/networking/devlink/devlink-resource.rst
+ *
+ * Context: Takes and release devlink->lock <mutex>.
*/
-void devlink_resources_unregister(struct devlink *devlink,
- struct devlink_resource *resource)
+int devlink_resource_register(struct devlink *devlink,
+ const char *resource_name,
+ u64 resource_size,
+ u64 resource_id,
+ u64 parent_resource_id,
+ const struct devlink_resource_size_params *size_params)
+{
+ int err;
+
+ devl_lock(devlink);
+ err = devl_resource_register(devlink, resource_name, resource_size,
+ resource_id, parent_resource_id, size_params);
+ devl_unlock(devlink);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_resource_register);
+
+static void devlink_resource_unregister(struct devlink *devlink,
+ struct devlink_resource *resource)
{
struct devlink_resource *tmp, *child_resource;
- struct list_head *resource_list;
- if (resource)
- resource_list = &resource->resource_list;
- else
- resource_list = &devlink->resource_list;
+ list_for_each_entry_safe(child_resource, tmp, &resource->resource_list,
+ list) {
+ devlink_resource_unregister(devlink, child_resource);
+ list_del(&child_resource->list);
+ kfree(child_resource);
+ }
+}
- if (!resource)
- mutex_lock(&devlink->lock);
+/**
+ * devl_resources_unregister - free all resources
+ *
+ * @devlink: devlink
+ */
+void devl_resources_unregister(struct devlink *devlink)
+{
+ struct devlink_resource *tmp, *child_resource;
+
+ lockdep_assert_held(&devlink->lock);
- list_for_each_entry_safe(child_resource, tmp, resource_list, list) {
- devlink_resources_unregister(devlink, child_resource);
+ list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list,
+ list) {
+ devlink_resource_unregister(devlink, child_resource);
list_del(&child_resource->list);
kfree(child_resource);
}
+}
+EXPORT_SYMBOL_GPL(devl_resources_unregister);
- if (!resource)
- mutex_unlock(&devlink->lock);
+/**
+ * devlink_resources_unregister - free all resources
+ *
+ * @devlink: devlink
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_resources_unregister(struct devlink *devlink)
+{
+ devl_lock(devlink);
+ devl_resources_unregister(devlink);
+ devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_resources_unregister);
/**
- * devlink_resource_size_get - get and update size
+ * devl_resource_size_get - get and update size
*
- * @devlink: devlink
- * @resource_id: the requested resource id
- * @p_resource_size: ptr to update
+ * @devlink: devlink
+ * @resource_id: the requested resource id
+ * @p_resource_size: ptr to update
*/
-int devlink_resource_size_get(struct devlink *devlink,
- u64 resource_id,
- u64 *p_resource_size)
+int devl_resource_size_get(struct devlink *devlink,
+ u64 resource_id,
+ u64 *p_resource_size)
{
struct devlink_resource *resource;
- int err = 0;
- mutex_lock(&devlink->lock);
+ lockdep_assert_held(&devlink->lock);
+
resource = devlink_resource_find(devlink, NULL, resource_id);
- if (!resource) {
- err = -EINVAL;
- goto out;
- }
+ if (!resource)
+ return -EINVAL;
*p_resource_size = resource->size_new;
resource->size = resource->size_new;
-out:
- mutex_unlock(&devlink->lock);
- return err;
+ return 0;
}
-EXPORT_SYMBOL_GPL(devlink_resource_size_get);
+EXPORT_SYMBOL_GPL(devl_resource_size_get);
/**
- * devlink_dpipe_table_resource_set - set the resource id
+ * devl_dpipe_table_resource_set - set the resource id
*
- * @devlink: devlink
- * @table_name: table name
- * @resource_id: resource id
- * @resource_units: number of resource's units consumed per table's entry
+ * @devlink: devlink
+ * @table_name: table name
+ * @resource_id: resource id
+ * @resource_units: number of resource's units consumed per table's entry
*/
-int devlink_dpipe_table_resource_set(struct devlink *devlink,
- const char *table_name, u64 resource_id,
- u64 resource_units)
+int devl_dpipe_table_resource_set(struct devlink *devlink,
+ const char *table_name, u64 resource_id,
+ u64 resource_units)
{
struct devlink_dpipe_table *table;
- int err = 0;
- mutex_lock(&devlink->lock);
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
table_name, devlink);
- if (!table) {
- err = -EINVAL;
- goto out;
- }
+ if (!table)
+ return -EINVAL;
+
table->resource_id = resource_id;
table->resource_units = resource_units;
table->resource_valid = true;
-out:
- mutex_unlock(&devlink->lock);
- return err;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_table_resource_set);
+
+/**
+ * devl_resource_occ_get_register - register occupancy getter
+ *
+ * @devlink: devlink
+ * @resource_id: resource id
+ * @occ_get: occupancy getter callback
+ * @occ_get_priv: occupancy getter callback priv
+ */
+void devl_resource_occ_get_register(struct devlink *devlink,
+ u64 resource_id,
+ devlink_resource_occ_get_t *occ_get,
+ void *occ_get_priv)
+{
+ struct devlink_resource *resource;
+
+ lockdep_assert_held(&devlink->lock);
+
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (WARN_ON(!resource))
+ return;
+ WARN_ON(resource->occ_get);
+
+ resource->occ_get = occ_get;
+ resource->occ_get_priv = occ_get_priv;
}
-EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
+EXPORT_SYMBOL_GPL(devl_resource_occ_get_register);
/**
* devlink_resource_occ_get_register - register occupancy getter
@@ -7109,48 +11030,58 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
* @resource_id: resource id
* @occ_get: occupancy getter callback
* @occ_get_priv: occupancy getter callback priv
+ *
+ * Context: Takes and release devlink->lock <mutex>.
*/
void devlink_resource_occ_get_register(struct devlink *devlink,
u64 resource_id,
devlink_resource_occ_get_t *occ_get,
void *occ_get_priv)
{
+ devl_lock(devlink);
+ devl_resource_occ_get_register(devlink, resource_id,
+ occ_get, occ_get_priv);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register);
+
+/**
+ * devl_resource_occ_get_unregister - unregister occupancy getter
+ *
+ * @devlink: devlink
+ * @resource_id: resource id
+ */
+void devl_resource_occ_get_unregister(struct devlink *devlink,
+ u64 resource_id)
+{
struct devlink_resource *resource;
- mutex_lock(&devlink->lock);
+ lockdep_assert_held(&devlink->lock);
+
resource = devlink_resource_find(devlink, NULL, resource_id);
if (WARN_ON(!resource))
- goto out;
- WARN_ON(resource->occ_get);
+ return;
+ WARN_ON(!resource->occ_get);
- resource->occ_get = occ_get;
- resource->occ_get_priv = occ_get_priv;
-out:
- mutex_unlock(&devlink->lock);
+ resource->occ_get = NULL;
+ resource->occ_get_priv = NULL;
}
-EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register);
+EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister);
/**
* devlink_resource_occ_get_unregister - unregister occupancy getter
*
* @devlink: devlink
* @resource_id: resource id
+ *
+ * Context: Takes and release devlink->lock <mutex>.
*/
void devlink_resource_occ_get_unregister(struct devlink *devlink,
u64 resource_id)
{
- struct devlink_resource *resource;
-
- mutex_lock(&devlink->lock);
- resource = devlink_resource_find(devlink, NULL, resource_id);
- if (WARN_ON(!resource))
- goto out;
- WARN_ON(!resource->occ_get);
-
- resource->occ_get = NULL;
- resource->occ_get_priv = NULL;
-out:
- mutex_unlock(&devlink->lock);
+ devl_lock(devlink);
+ devl_resource_occ_get_unregister(devlink, resource_id);
+ devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister);
@@ -7164,61 +11095,6 @@ static int devlink_param_verify(const struct devlink_param *param)
return devlink_param_driver_verify(param);
}
-static int __devlink_params_register(struct devlink *devlink,
- unsigned int port_index,
- struct list_head *param_list,
- const struct devlink_param *params,
- size_t params_count,
- enum devlink_command reg_cmd,
- enum devlink_command unreg_cmd)
-{
- const struct devlink_param *param = params;
- int i;
- int err;
-
- mutex_lock(&devlink->lock);
- for (i = 0; i < params_count; i++, param++) {
- err = devlink_param_verify(param);
- if (err)
- goto rollback;
-
- err = devlink_param_register_one(devlink, port_index,
- param_list, param, reg_cmd);
- if (err)
- goto rollback;
- }
-
- mutex_unlock(&devlink->lock);
- return 0;
-
-rollback:
- if (!i)
- goto unlock;
- for (param--; i > 0; i--, param--)
- devlink_param_unregister_one(devlink, port_index, param_list,
- param, unreg_cmd);
-unlock:
- mutex_unlock(&devlink->lock);
- return err;
-}
-
-static void __devlink_params_unregister(struct devlink *devlink,
- unsigned int port_index,
- struct list_head *param_list,
- const struct devlink_param *params,
- size_t params_count,
- enum devlink_command cmd)
-{
- const struct devlink_param *param = params;
- int i;
-
- mutex_lock(&devlink->lock);
- for (i = 0; i < params_count; i++, param++)
- devlink_param_unregister_one(devlink, 0, param_list, param,
- cmd);
- mutex_unlock(&devlink->lock);
-}
-
/**
* devlink_params_register - register configuration parameters
*
@@ -7232,10 +11108,25 @@ int devlink_params_register(struct devlink *devlink,
const struct devlink_param *params,
size_t params_count)
{
- return __devlink_params_register(devlink, 0, &devlink->param_list,
- params, params_count,
- DEVLINK_CMD_PARAM_NEW,
- DEVLINK_CMD_PARAM_DEL);
+ const struct devlink_param *param = params;
+ int i, err;
+
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
+
+ for (i = 0; i < params_count; i++, param++) {
+ err = devlink_param_register(devlink, param);
+ if (err)
+ goto rollback;
+ }
+ return 0;
+
+rollback:
+ if (!i)
+ return err;
+
+ for (param--; i > 0; i--, param--)
+ devlink_param_unregister(devlink, param);
+ return err;
}
EXPORT_SYMBOL_GPL(devlink_params_register);
@@ -7249,145 +11140,70 @@ void devlink_params_unregister(struct devlink *devlink,
const struct devlink_param *params,
size_t params_count)
{
- return __devlink_params_unregister(devlink, 0, &devlink->param_list,
- params, params_count,
- DEVLINK_CMD_PARAM_DEL);
-}
-EXPORT_SYMBOL_GPL(devlink_params_unregister);
+ const struct devlink_param *param = params;
+ int i;
-/**
- * devlink_params_publish - publish configuration parameters
- *
- * @devlink: devlink
- *
- * Publish previously registered configuration parameters.
- */
-void devlink_params_publish(struct devlink *devlink)
-{
- struct devlink_param_item *param_item;
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
- list_for_each_entry(param_item, &devlink->param_list, list) {
- if (param_item->published)
- continue;
- param_item->published = true;
- devlink_param_notify(devlink, 0, param_item,
- DEVLINK_CMD_PARAM_NEW);
- }
+ for (i = 0; i < params_count; i++, param++)
+ devlink_param_unregister(devlink, param);
}
-EXPORT_SYMBOL_GPL(devlink_params_publish);
+EXPORT_SYMBOL_GPL(devlink_params_unregister);
/**
- * devlink_params_unpublish - unpublish configuration parameters
+ * devlink_param_register - register one configuration parameter
*
- * @devlink: devlink
+ * @devlink: devlink
+ * @param: one configuration parameter
*
- * Unpublish previously registered configuration parameters.
+ * Register the configuration parameter supported by the driver.
+ * Return: returns 0 on successful registration or error code otherwise.
*/
-void devlink_params_unpublish(struct devlink *devlink)
+int devlink_param_register(struct devlink *devlink,
+ const struct devlink_param *param)
{
struct devlink_param_item *param_item;
- list_for_each_entry(param_item, &devlink->param_list, list) {
- if (!param_item->published)
- continue;
- param_item->published = false;
- devlink_param_notify(devlink, 0, param_item,
- DEVLINK_CMD_PARAM_DEL);
- }
-}
-EXPORT_SYMBOL_GPL(devlink_params_unpublish);
-
-/**
- * devlink_port_params_register - register port configuration parameters
- *
- * @devlink_port: devlink port
- * @params: configuration parameters array
- * @params_count: number of parameters provided
- *
- * Register the configuration parameters supported by the port.
- */
-int devlink_port_params_register(struct devlink_port *devlink_port,
- const struct devlink_param *params,
- size_t params_count)
-{
- return __devlink_params_register(devlink_port->devlink,
- devlink_port->index,
- &devlink_port->param_list, params,
- params_count,
- DEVLINK_CMD_PORT_PARAM_NEW,
- DEVLINK_CMD_PORT_PARAM_DEL);
-}
-EXPORT_SYMBOL_GPL(devlink_port_params_register);
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
-/**
- * devlink_port_params_unregister - unregister port configuration
- * parameters
- *
- * @devlink_port: devlink port
- * @params: configuration parameters array
- * @params_count: number of parameters provided
- */
-void devlink_port_params_unregister(struct devlink_port *devlink_port,
- const struct devlink_param *params,
- size_t params_count)
-{
- return __devlink_params_unregister(devlink_port->devlink,
- devlink_port->index,
- &devlink_port->param_list,
- params, params_count,
- DEVLINK_CMD_PORT_PARAM_DEL);
-}
-EXPORT_SYMBOL_GPL(devlink_port_params_unregister);
+ WARN_ON(devlink_param_verify(param));
+ WARN_ON(devlink_param_find_by_name(&devlink->param_list, param->name));
-static int
-__devlink_param_driverinit_value_get(struct list_head *param_list, u32 param_id,
- union devlink_param_value *init_val)
-{
- struct devlink_param_item *param_item;
+ if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT))
+ WARN_ON(param->get || param->set);
+ else
+ WARN_ON(!param->get || !param->set);
- param_item = devlink_param_find_by_id(param_list, param_id);
+ param_item = kzalloc(sizeof(*param_item), GFP_KERNEL);
if (!param_item)
- return -EINVAL;
-
- if (!param_item->driverinit_value_valid ||
- !devlink_param_cmode_is_supported(param_item->param,
- DEVLINK_PARAM_CMODE_DRIVERINIT))
- return -EOPNOTSUPP;
+ return -ENOMEM;
- if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
- strcpy(init_val->vstr, param_item->driverinit_value.vstr);
- else
- *init_val = param_item->driverinit_value;
+ param_item->param = param;
+ list_add_tail(&param_item->list, &devlink->param_list);
return 0;
}
+EXPORT_SYMBOL_GPL(devlink_param_register);
-static int
-__devlink_param_driverinit_value_set(struct devlink *devlink,
- unsigned int port_index,
- struct list_head *param_list, u32 param_id,
- union devlink_param_value init_val,
- enum devlink_command cmd)
+/**
+ * devlink_param_unregister - unregister one configuration parameter
+ * @devlink: devlink
+ * @param: configuration parameter to unregister
+ */
+void devlink_param_unregister(struct devlink *devlink,
+ const struct devlink_param *param)
{
struct devlink_param_item *param_item;
- param_item = devlink_param_find_by_id(param_list, param_id);
- if (!param_item)
- return -EINVAL;
-
- if (!devlink_param_cmode_is_supported(param_item->param,
- DEVLINK_PARAM_CMODE_DRIVERINIT))
- return -EOPNOTSUPP;
-
- if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
- strcpy(param_item->driverinit_value.vstr, init_val.vstr);
- else
- param_item->driverinit_value = init_val;
- param_item->driverinit_value_valid = true;
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
- devlink_param_notify(devlink, port_index, param_item, cmd);
- return 0;
+ param_item =
+ devlink_param_find_by_name(&devlink->param_list, param->name);
+ WARN_ON(!param_item);
+ list_del(&param_item->list);
+ kfree(param_item);
}
+EXPORT_SYMBOL_GPL(devlink_param_unregister);
/**
* devlink_param_driverinit_value_get - get configuration parameter
@@ -7403,11 +11219,26 @@ __devlink_param_driverinit_value_set(struct devlink *devlink,
int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
union devlink_param_value *init_val)
{
- if (!devlink_reload_supported(devlink))
+ struct devlink_param_item *param_item;
+
+ if (!devlink_reload_supported(devlink->ops))
return -EOPNOTSUPP;
- return __devlink_param_driverinit_value_get(&devlink->param_list,
- param_id, init_val);
+ param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
+ if (!param_item)
+ return -EINVAL;
+
+ if (!param_item->driverinit_value_valid ||
+ !devlink_param_cmode_is_supported(param_item->param,
+ DEVLINK_PARAM_CMODE_DRIVERINIT))
+ return -EOPNOTSUPP;
+
+ if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
+ strcpy(init_val->vstr, param_item->driverinit_value.vstr);
+ else
+ *init_val = param_item->driverinit_value;
+
+ return 0;
}
EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get);
@@ -7426,61 +11257,26 @@ EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get);
int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
union devlink_param_value init_val)
{
- return __devlink_param_driverinit_value_set(devlink, 0,
- &devlink->param_list,
- param_id, init_val,
- DEVLINK_CMD_PARAM_NEW);
-}
-EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set);
+ struct devlink_param_item *param_item;
-/**
- * devlink_port_param_driverinit_value_get - get configuration parameter
- * value for driver initializing
- *
- * @devlink_port: devlink_port
- * @param_id: parameter ID
- * @init_val: value of parameter in driverinit configuration mode
- *
- * This function should be used by the driver to get driverinit
- * configuration for initialization after reload command.
- */
-int devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port,
- u32 param_id,
- union devlink_param_value *init_val)
-{
- struct devlink *devlink = devlink_port->devlink;
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
- if (!devlink_reload_supported(devlink))
- return -EOPNOTSUPP;
+ param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
+ if (!param_item)
+ return -EINVAL;
- return __devlink_param_driverinit_value_get(&devlink_port->param_list,
- param_id, init_val);
-}
-EXPORT_SYMBOL_GPL(devlink_port_param_driverinit_value_get);
+ if (!devlink_param_cmode_is_supported(param_item->param,
+ DEVLINK_PARAM_CMODE_DRIVERINIT))
+ return -EOPNOTSUPP;
-/**
- * devlink_port_param_driverinit_value_set - set value of configuration
- * parameter for driverinit
- * configuration mode
- *
- * @devlink_port: devlink_port
- * @param_id: parameter ID
- * @init_val: value of parameter to set for driverinit configuration mode
- *
- * This function should be used by the driver to set driverinit
- * configuration mode default value.
- */
-int devlink_port_param_driverinit_value_set(struct devlink_port *devlink_port,
- u32 param_id,
- union devlink_param_value init_val)
-{
- return __devlink_param_driverinit_value_set(devlink_port->devlink,
- devlink_port->index,
- &devlink_port->param_list,
- param_id, init_val,
- DEVLINK_CMD_PORT_PARAM_NEW);
+ if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
+ strcpy(param_item->driverinit_value.vstr, init_val.vstr);
+ else
+ param_item->driverinit_value = init_val;
+ param_item->driverinit_value_valid = true;
+ return 0;
}
-EXPORT_SYMBOL_GPL(devlink_port_param_driverinit_value_set);
+EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set);
/**
* devlink_param_value_changed - notify devlink on a parameter's value
@@ -7506,68 +11302,97 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id)
EXPORT_SYMBOL_GPL(devlink_param_value_changed);
/**
- * devlink_port_param_value_changed - notify devlink on a parameter's value
- * change. Should be called by the driver
- * right after the change.
- *
- * @devlink_port: devlink_port
- * @param_id: parameter ID
+ * devl_region_create - create a new address region
*
- * This function should be used by the driver to notify devlink on value
- * change, excluding driverinit configuration mode.
- * For driverinit configuration mode driver should use the function
- * devlink_port_param_driverinit_value_set() instead.
+ * @devlink: devlink
+ * @ops: region operations and name
+ * @region_max_snapshots: Maximum supported number of snapshots for region
+ * @region_size: size of region
*/
-void devlink_port_param_value_changed(struct devlink_port *devlink_port,
- u32 param_id)
+struct devlink_region *devl_region_create(struct devlink *devlink,
+ const struct devlink_region_ops *ops,
+ u32 region_max_snapshots,
+ u64 region_size)
{
- struct devlink_param_item *param_item;
+ struct devlink_region *region;
- param_item = devlink_param_find_by_id(&devlink_port->param_list,
- param_id);
- WARN_ON(!param_item);
+ devl_assert_locked(devlink);
+
+ if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
+ return ERR_PTR(-EINVAL);
+
+ if (devlink_region_get_by_name(devlink, ops->name))
+ return ERR_PTR(-EEXIST);
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL);
+ if (!region)
+ return ERR_PTR(-ENOMEM);
+
+ region->devlink = devlink;
+ region->max_snapshots = region_max_snapshots;
+ region->ops = ops;
+ region->size = region_size;
+ INIT_LIST_HEAD(&region->snapshot_list);
+ mutex_init(&region->snapshot_lock);
+ list_add_tail(&region->list, &devlink->region_list);
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
- devlink_param_notify(devlink_port->devlink, devlink_port->index,
- param_item, DEVLINK_CMD_PORT_PARAM_NEW);
+ return region;
}
-EXPORT_SYMBOL_GPL(devlink_port_param_value_changed);
+EXPORT_SYMBOL_GPL(devl_region_create);
/**
- * devlink_param_value_str_fill - Safely fill-up the string preventing
- * from overflow of the preallocated buffer
+ * devlink_region_create - create a new address region
*
- * @dst_val: destination devlink_param_value
- * @src: source buffer
+ * @devlink: devlink
+ * @ops: region operations and name
+ * @region_max_snapshots: Maximum supported number of snapshots for region
+ * @region_size: size of region
+ *
+ * Context: Takes and release devlink->lock <mutex>.
*/
-void devlink_param_value_str_fill(union devlink_param_value *dst_val,
- const char *src)
+struct devlink_region *
+devlink_region_create(struct devlink *devlink,
+ const struct devlink_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
{
- size_t len;
+ struct devlink_region *region;
- len = strlcpy(dst_val->vstr, src, __DEVLINK_PARAM_MAX_STRING_VALUE);
- WARN_ON(len >= __DEVLINK_PARAM_MAX_STRING_VALUE);
+ devl_lock(devlink);
+ region = devl_region_create(devlink, ops, region_max_snapshots,
+ region_size);
+ devl_unlock(devlink);
+ return region;
}
-EXPORT_SYMBOL_GPL(devlink_param_value_str_fill);
+EXPORT_SYMBOL_GPL(devlink_region_create);
/**
- * devlink_region_create - create a new address region
+ * devlink_port_region_create - create a new address region for a port
*
- * @devlink: devlink
- * @region_name: region name
+ * @port: devlink port
+ * @ops: region operations and name
* @region_max_snapshots: Maximum supported number of snapshots for region
* @region_size: size of region
+ *
+ * Context: Takes and release devlink->lock <mutex>.
*/
-struct devlink_region *devlink_region_create(struct devlink *devlink,
- const char *region_name,
- u32 region_max_snapshots,
- u64 region_size)
+struct devlink_region *
+devlink_port_region_create(struct devlink_port *port,
+ const struct devlink_port_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
{
+ struct devlink *devlink = port->devlink;
struct devlink_region *region;
int err = 0;
- mutex_lock(&devlink->lock);
+ ASSERT_DEVLINK_PORT_INITIALIZED(port);
+
+ if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
+ return ERR_PTR(-EINVAL);
+
+ devl_lock(devlink);
- if (devlink_region_get_by_name(devlink, region_name)) {
+ if (devlink_port_region_get_by_name(port, ops->name)) {
err = -EEXIST;
goto unlock;
}
@@ -7579,44 +11404,63 @@ struct devlink_region *devlink_region_create(struct devlink *devlink,
}
region->devlink = devlink;
+ region->port = port;
region->max_snapshots = region_max_snapshots;
- region->name = region_name;
+ region->port_ops = ops;
region->size = region_size;
INIT_LIST_HEAD(&region->snapshot_list);
- list_add_tail(&region->list, &devlink->region_list);
+ mutex_init(&region->snapshot_lock);
+ list_add_tail(&region->list, &port->region_list);
devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
return region;
unlock:
- mutex_unlock(&devlink->lock);
+ devl_unlock(devlink);
return ERR_PTR(err);
}
-EXPORT_SYMBOL_GPL(devlink_region_create);
+EXPORT_SYMBOL_GPL(devlink_port_region_create);
/**
- * devlink_region_destroy - destroy address region
+ * devl_region_destroy - destroy address region
*
- * @region: devlink region to destroy
+ * @region: devlink region to destroy
*/
-void devlink_region_destroy(struct devlink_region *region)
+void devl_region_destroy(struct devlink_region *region)
{
struct devlink *devlink = region->devlink;
struct devlink_snapshot *snapshot, *ts;
- mutex_lock(&devlink->lock);
+ devl_assert_locked(devlink);
/* Free all snapshots of region */
list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list)
devlink_region_snapshot_del(region, snapshot);
list_del(&region->list);
+ mutex_destroy(&region->snapshot_lock);
devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
- mutex_unlock(&devlink->lock);
kfree(region);
}
+EXPORT_SYMBOL_GPL(devl_region_destroy);
+
+/**
+ * devlink_region_destroy - destroy address region
+ *
+ * @region: devlink region to destroy
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_region_destroy(struct devlink_region *region)
+{
+ struct devlink *devlink = region->devlink;
+
+ devl_lock(devlink);
+ devl_region_destroy(region);
+ devl_unlock(devlink);
+}
EXPORT_SYMBOL_GPL(devlink_region_destroy);
/**
@@ -7626,75 +11470,56 @@ EXPORT_SYMBOL_GPL(devlink_region_destroy);
* Driver should use the same id for multiple snapshots taken
* on multiple regions at the same time/by the same trigger.
*
+ * The caller of this function must use devlink_region_snapshot_id_put
+ * when finished creating regions using this id.
+ *
+ * Returns zero on success, or a negative error code on failure.
+ *
* @devlink: devlink
+ * @id: storage to return id
*/
-u32 devlink_region_snapshot_id_get(struct devlink *devlink)
+int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
{
- u32 id;
-
- mutex_lock(&devlink->lock);
- id = ++devlink->snapshot_id;
- mutex_unlock(&devlink->lock);
-
- return id;
+ return __devlink_region_snapshot_id_get(devlink, id);
}
EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get);
/**
+ * devlink_region_snapshot_id_put - put snapshot ID reference
+ *
+ * This should be called by a driver after finishing creating snapshots
+ * with an id. Doing so ensures that the ID can later be released in the
+ * event that all snapshots using it have been destroyed.
+ *
+ * @devlink: devlink
+ * @id: id to release reference on
+ */
+void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id)
+{
+ __devlink_snapshot_id_decrement(devlink, id);
+}
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put);
+
+/**
* devlink_region_snapshot_create - create a new snapshot
* This will add a new snapshot of a region. The snapshot
* will be stored on the region struct and can be accessed
- * from devlink. This is useful for future analyses of snapshots.
+ * from devlink. This is useful for future analyses of snapshots.
* Multiple snapshots can be created on a region.
* The @snapshot_id should be obtained using the getter function.
*
* @region: devlink region of the snapshot
* @data: snapshot data
* @snapshot_id: snapshot id to be created
- * @data_destructor: pointer to destructor function to free data
*/
int devlink_region_snapshot_create(struct devlink_region *region,
- u8 *data, u32 snapshot_id,
- devlink_snapshot_data_dest_t *data_destructor)
+ u8 *data, u32 snapshot_id)
{
- struct devlink *devlink = region->devlink;
- struct devlink_snapshot *snapshot;
int err;
- mutex_lock(&devlink->lock);
-
- /* check if region can hold one more snapshot */
- if (region->cur_snapshots == region->max_snapshots) {
- err = -ENOMEM;
- goto unlock;
- }
-
- if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
- err = -EEXIST;
- goto unlock;
- }
-
- snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
- if (!snapshot) {
- err = -ENOMEM;
- goto unlock;
- }
-
- snapshot->id = snapshot_id;
- snapshot->region = region;
- snapshot->data = data;
- snapshot->data_destructor = data_destructor;
-
- list_add_tail(&snapshot->list, &region->snapshot_list);
-
- region->cur_snapshots++;
-
- devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
- mutex_unlock(&devlink->lock);
- return 0;
-
-unlock:
- mutex_unlock(&devlink->lock);
+ mutex_lock(&region->snapshot_lock);
+ err = __devlink_region_snapshot_create(region, data, snapshot_id);
+ mutex_unlock(&region->snapshot_lock);
return err;
}
EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
@@ -7734,6 +11559,71 @@ static const struct devlink_trap devlink_trap_generic[] = {
DEVLINK_TRAP(NON_ROUTABLE, DROP),
DEVLINK_TRAP(DECAP_ERROR, EXCEPTION),
DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP),
+ DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP),
+ DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP),
+ DEVLINK_TRAP(STP, CONTROL),
+ DEVLINK_TRAP(LACP, CONTROL),
+ DEVLINK_TRAP(LLDP, CONTROL),
+ DEVLINK_TRAP(IGMP_QUERY, CONTROL),
+ DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL),
+ DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL),
+ DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL),
+ DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL),
+ DEVLINK_TRAP(MLD_QUERY, CONTROL),
+ DEVLINK_TRAP(MLD_V1_REPORT, CONTROL),
+ DEVLINK_TRAP(MLD_V2_REPORT, CONTROL),
+ DEVLINK_TRAP(MLD_V1_DONE, CONTROL),
+ DEVLINK_TRAP(IPV4_DHCP, CONTROL),
+ DEVLINK_TRAP(IPV6_DHCP, CONTROL),
+ DEVLINK_TRAP(ARP_REQUEST, CONTROL),
+ DEVLINK_TRAP(ARP_RESPONSE, CONTROL),
+ DEVLINK_TRAP(ARP_OVERLAY, CONTROL),
+ DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL),
+ DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL),
+ DEVLINK_TRAP(IPV4_BFD, CONTROL),
+ DEVLINK_TRAP(IPV6_BFD, CONTROL),
+ DEVLINK_TRAP(IPV4_OSPF, CONTROL),
+ DEVLINK_TRAP(IPV6_OSPF, CONTROL),
+ DEVLINK_TRAP(IPV4_BGP, CONTROL),
+ DEVLINK_TRAP(IPV6_BGP, CONTROL),
+ DEVLINK_TRAP(IPV4_VRRP, CONTROL),
+ DEVLINK_TRAP(IPV6_VRRP, CONTROL),
+ DEVLINK_TRAP(IPV4_PIM, CONTROL),
+ DEVLINK_TRAP(IPV6_PIM, CONTROL),
+ DEVLINK_TRAP(UC_LB, CONTROL),
+ DEVLINK_TRAP(LOCAL_ROUTE, CONTROL),
+ DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL),
+ DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL),
+ DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL),
+ DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL),
+ DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL),
+ DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL),
+ DEVLINK_TRAP(IPV6_REDIRECT, CONTROL),
+ DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL),
+ DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL),
+ DEVLINK_TRAP(PTP_EVENT, CONTROL),
+ DEVLINK_TRAP(PTP_GENERAL, CONTROL),
+ DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL),
+ DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL),
+ DEVLINK_TRAP(EARLY_DROP, DROP),
+ DEVLINK_TRAP(VXLAN_PARSING, DROP),
+ DEVLINK_TRAP(LLC_SNAP_PARSING, DROP),
+ DEVLINK_TRAP(VLAN_PARSING, DROP),
+ DEVLINK_TRAP(PPPOE_PPP_PARSING, DROP),
+ DEVLINK_TRAP(MPLS_PARSING, DROP),
+ DEVLINK_TRAP(ARP_PARSING, DROP),
+ DEVLINK_TRAP(IP_1_PARSING, DROP),
+ DEVLINK_TRAP(IP_N_PARSING, DROP),
+ DEVLINK_TRAP(GRE_PARSING, DROP),
+ DEVLINK_TRAP(UDP_PARSING, DROP),
+ DEVLINK_TRAP(TCP_PARSING, DROP),
+ DEVLINK_TRAP(IPSEC_PARSING, DROP),
+ DEVLINK_TRAP(SCTP_PARSING, DROP),
+ DEVLINK_TRAP(DCCP_PARSING, DROP),
+ DEVLINK_TRAP(GTP_PARSING, DROP),
+ DEVLINK_TRAP(ESP_PARSING, DROP),
+ DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP),
+ DEVLINK_TRAP(DMAC_FILTER, DROP),
};
#define DEVLINK_TRAP_GROUP(_id) \
@@ -7745,8 +11635,30 @@ static const struct devlink_trap devlink_trap_generic[] = {
static const struct devlink_trap_group devlink_trap_group_generic[] = {
DEVLINK_TRAP_GROUP(L2_DROPS),
DEVLINK_TRAP_GROUP(L3_DROPS),
+ DEVLINK_TRAP_GROUP(L3_EXCEPTIONS),
DEVLINK_TRAP_GROUP(BUFFER_DROPS),
DEVLINK_TRAP_GROUP(TUNNEL_DROPS),
+ DEVLINK_TRAP_GROUP(ACL_DROPS),
+ DEVLINK_TRAP_GROUP(STP),
+ DEVLINK_TRAP_GROUP(LACP),
+ DEVLINK_TRAP_GROUP(LLDP),
+ DEVLINK_TRAP_GROUP(MC_SNOOPING),
+ DEVLINK_TRAP_GROUP(DHCP),
+ DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY),
+ DEVLINK_TRAP_GROUP(BFD),
+ DEVLINK_TRAP_GROUP(OSPF),
+ DEVLINK_TRAP_GROUP(BGP),
+ DEVLINK_TRAP_GROUP(VRRP),
+ DEVLINK_TRAP_GROUP(PIM),
+ DEVLINK_TRAP_GROUP(UC_LB),
+ DEVLINK_TRAP_GROUP(LOCAL_DELIVERY),
+ DEVLINK_TRAP_GROUP(EXTERNAL_DELIVERY),
+ DEVLINK_TRAP_GROUP(IPV6),
+ DEVLINK_TRAP_GROUP(PTP_EVENT),
+ DEVLINK_TRAP_GROUP(PTP_GENERAL),
+ DEVLINK_TRAP_GROUP(ACL_SAMPLE),
+ DEVLINK_TRAP_GROUP(ACL_TRAP),
+ DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS),
};
static int devlink_trap_generic_verify(const struct devlink_trap *trap)
@@ -7780,7 +11692,7 @@ static int devlink_trap_driver_verify(const struct devlink_trap *trap)
static int devlink_trap_verify(const struct devlink_trap *trap)
{
- if (!trap || !trap->name || !trap->group.name)
+ if (!trap || !trap->name)
return -EINVAL;
if (trap->generic)
@@ -7835,6 +11747,8 @@ devlink_trap_group_notify(struct devlink *devlink,
WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW &&
cmd != DEVLINK_CMD_TRAP_GROUP_DEL);
+ if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
+ return;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
@@ -7851,108 +11765,22 @@ devlink_trap_group_notify(struct devlink *devlink,
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
-static struct devlink_trap_group_item *
-devlink_trap_group_item_create(struct devlink *devlink,
- const struct devlink_trap_group *group)
-{
- struct devlink_trap_group_item *group_item;
- int err;
-
- err = devlink_trap_group_verify(group);
- if (err)
- return ERR_PTR(err);
-
- group_item = kzalloc(sizeof(*group_item), GFP_KERNEL);
- if (!group_item)
- return ERR_PTR(-ENOMEM);
-
- group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
- if (!group_item->stats) {
- err = -ENOMEM;
- goto err_stats_alloc;
- }
-
- group_item->group = group;
- refcount_set(&group_item->refcount, 1);
-
- if (devlink->ops->trap_group_init) {
- err = devlink->ops->trap_group_init(devlink, group);
- if (err)
- goto err_group_init;
- }
-
- list_add_tail(&group_item->list, &devlink->trap_group_list);
- devlink_trap_group_notify(devlink, group_item,
- DEVLINK_CMD_TRAP_GROUP_NEW);
-
- return group_item;
-
-err_group_init:
- free_percpu(group_item->stats);
-err_stats_alloc:
- kfree(group_item);
- return ERR_PTR(err);
-}
-
-static void
-devlink_trap_group_item_destroy(struct devlink *devlink,
- struct devlink_trap_group_item *group_item)
-{
- devlink_trap_group_notify(devlink, group_item,
- DEVLINK_CMD_TRAP_GROUP_DEL);
- list_del(&group_item->list);
- free_percpu(group_item->stats);
- kfree(group_item);
-}
-
-static struct devlink_trap_group_item *
-devlink_trap_group_item_get(struct devlink *devlink,
- const struct devlink_trap_group *group)
-{
- struct devlink_trap_group_item *group_item;
-
- group_item = devlink_trap_group_item_lookup(devlink, group->name);
- if (group_item) {
- refcount_inc(&group_item->refcount);
- return group_item;
- }
-
- return devlink_trap_group_item_create(devlink, group);
-}
-
-static void
-devlink_trap_group_item_put(struct devlink *devlink,
- struct devlink_trap_group_item *group_item)
-{
- if (!refcount_dec_and_test(&group_item->refcount))
- return;
-
- devlink_trap_group_item_destroy(devlink, group_item);
-}
-
static int
devlink_trap_item_group_link(struct devlink *devlink,
struct devlink_trap_item *trap_item)
{
+ u16 group_id = trap_item->trap->init_group_id;
struct devlink_trap_group_item *group_item;
- group_item = devlink_trap_group_item_get(devlink,
- &trap_item->trap->group);
- if (IS_ERR(group_item))
- return PTR_ERR(group_item);
+ group_item = devlink_trap_group_item_lookup_by_id(devlink, group_id);
+ if (WARN_ON_ONCE(!group_item))
+ return -EINVAL;
trap_item->group_item = group_item;
return 0;
}
-static void
-devlink_trap_item_group_unlink(struct devlink *devlink,
- struct devlink_trap_item *trap_item)
-{
- devlink_trap_group_item_put(devlink, trap_item->group_item);
-}
-
static void devlink_trap_notify(struct devlink *devlink,
const struct devlink_trap_item *trap_item,
enum devlink_command cmd)
@@ -7962,6 +11790,8 @@ static void devlink_trap_notify(struct devlink *devlink,
WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW &&
cmd != DEVLINK_CMD_TRAP_DEL);
+ if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
+ return;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
@@ -8015,7 +11845,6 @@ devlink_trap_register(struct devlink *devlink,
return 0;
err_trap_init:
- devlink_trap_item_group_unlink(devlink, trap_item);
err_group_link:
free_percpu(trap_item->stats);
err_stats_alloc:
@@ -8036,7 +11865,6 @@ static void devlink_trap_unregister(struct devlink *devlink,
list_del(&trap_item->list);
if (devlink->ops->trap_fini)
devlink->ops->trap_fini(devlink, trap, trap_item);
- devlink_trap_item_group_unlink(devlink, trap_item);
free_percpu(trap_item->stats);
kfree(trap_item);
}
@@ -8050,12 +11878,13 @@ static void devlink_trap_disable(struct devlink *devlink,
if (WARN_ON_ONCE(!trap_item))
return;
- devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP);
+ devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP,
+ NULL);
trap_item->action = DEVLINK_TRAP_ACTION_DROP;
}
/**
- * devlink_traps_register - Register packet traps with devlink.
+ * devl_traps_register - Register packet traps with devlink.
* @devlink: devlink.
* @traps: Packet traps.
* @traps_count: Count of provided packet traps.
@@ -8063,16 +11892,16 @@ static void devlink_trap_disable(struct devlink *devlink,
*
* Return: Non-zero value on failure.
*/
-int devlink_traps_register(struct devlink *devlink,
- const struct devlink_trap *traps,
- size_t traps_count, void *priv)
+int devl_traps_register(struct devlink *devlink,
+ const struct devlink_trap *traps,
+ size_t traps_count, void *priv)
{
int i, err;
if (!devlink->ops->trap_init || !devlink->ops->trap_action_set)
return -EINVAL;
- mutex_lock(&devlink->lock);
+ devl_assert_locked(devlink);
for (i = 0; i < traps_count; i++) {
const struct devlink_trap *trap = &traps[i];
@@ -8084,7 +11913,6 @@ int devlink_traps_register(struct devlink *devlink,
if (err)
goto err_trap_register;
}
- mutex_unlock(&devlink->lock);
return 0;
@@ -8092,24 +11920,47 @@ err_trap_register:
err_trap_verify:
for (i--; i >= 0; i--)
devlink_trap_unregister(devlink, &traps[i]);
- mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devl_traps_register);
+
+/**
+ * devlink_traps_register - Register packet traps with devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ * @priv: Driver private information.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devlink_traps_register(struct devlink *devlink,
+ const struct devlink_trap *traps,
+ size_t traps_count, void *priv)
+{
+ int err;
+
+ devl_lock(devlink);
+ err = devl_traps_register(devlink, traps, traps_count, priv);
+ devl_unlock(devlink);
return err;
}
EXPORT_SYMBOL_GPL(devlink_traps_register);
/**
- * devlink_traps_unregister - Unregister packet traps from devlink.
+ * devl_traps_unregister - Unregister packet traps from devlink.
* @devlink: devlink.
* @traps: Packet traps.
* @traps_count: Count of provided packet traps.
*/
-void devlink_traps_unregister(struct devlink *devlink,
- const struct devlink_trap *traps,
- size_t traps_count)
+void devl_traps_unregister(struct devlink *devlink,
+ const struct devlink_trap *traps,
+ size_t traps_count)
{
int i;
- mutex_lock(&devlink->lock);
+ devl_assert_locked(devlink);
/* Make sure we do not have any packets in-flight while unregistering
* traps by disabling all of them and waiting for a grace period.
*/
@@ -8118,7 +11969,24 @@ void devlink_traps_unregister(struct devlink *devlink,
synchronize_rcu();
for (i = traps_count - 1; i >= 0; i--)
devlink_trap_unregister(devlink, &traps[i]);
- mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_traps_unregister);
+
+/**
+ * devlink_traps_unregister - Unregister packet traps from devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_traps_unregister(struct devlink *devlink,
+ const struct devlink_trap *traps,
+ size_t traps_count)
+{
+ devl_lock(devlink);
+ devl_traps_unregister(devlink, traps, traps_count);
+ devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_traps_unregister);
@@ -8130,24 +11998,25 @@ devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats,
stats = this_cpu_ptr(trap_stats);
u64_stats_update_begin(&stats->syncp);
- stats->rx_bytes += skb_len;
- stats->rx_packets++;
+ u64_stats_add(&stats->rx_bytes, skb_len);
+ u64_stats_inc(&stats->rx_packets);
u64_stats_update_end(&stats->syncp);
}
static void
-devlink_trap_report_metadata_fill(struct net_dm_hw_metadata *hw_metadata,
- const struct devlink_trap_item *trap_item,
- struct devlink_port *in_devlink_port)
+devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata,
+ const struct devlink_trap_item *trap_item,
+ struct devlink_port *in_devlink_port,
+ const struct flow_action_cookie *fa_cookie)
{
- struct devlink_trap_group_item *group_item = trap_item->group_item;
-
- hw_metadata->trap_group_name = group_item->group->name;
- hw_metadata->trap_name = trap_item->trap->name;
+ metadata->trap_name = trap_item->trap->name;
+ metadata->trap_group_name = trap_item->group_item->group->name;
+ metadata->fa_cookie = fa_cookie;
+ metadata->trap_type = trap_item->trap->type;
spin_lock(&in_devlink_port->type_lock);
if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH)
- hw_metadata->input_dev = in_devlink_port->type_dev;
+ metadata->input_dev = in_devlink_port->type_dev;
spin_unlock(&in_devlink_port->type_lock);
}
@@ -8157,19 +12026,25 @@ devlink_trap_report_metadata_fill(struct net_dm_hw_metadata *hw_metadata,
* @skb: Trapped packet.
* @trap_ctx: Trap context.
* @in_devlink_port: Input devlink port.
+ * @fa_cookie: Flow action cookie. Could be NULL.
*/
void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb,
- void *trap_ctx, struct devlink_port *in_devlink_port)
+ void *trap_ctx, struct devlink_port *in_devlink_port,
+ const struct flow_action_cookie *fa_cookie)
+
{
struct devlink_trap_item *trap_item = trap_ctx;
- struct net_dm_hw_metadata hw_metadata = {};
devlink_trap_stats_update(trap_item->stats, skb->len);
devlink_trap_stats_update(trap_item->group_item->stats, skb->len);
- devlink_trap_report_metadata_fill(&hw_metadata, trap_item,
- in_devlink_port);
- net_dm_hw_report(skb, &hw_metadata);
+ if (trace_devlink_trap_report_enabled()) {
+ struct devlink_trap_metadata metadata = {};
+
+ devlink_trap_report_metadata_set(&metadata, trap_item,
+ in_devlink_port, fa_cookie);
+ trace_devlink_trap_report(devlink, skb, &metadata);
+ }
}
EXPORT_SYMBOL_GPL(devlink_trap_report);
@@ -8187,11 +12062,329 @@ void *devlink_trap_ctx_priv(void *trap_ctx)
}
EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv);
+static int
+devlink_trap_group_item_policer_link(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item)
+{
+ u32 policer_id = group_item->group->init_policer_id;
+ struct devlink_trap_policer_item *policer_item;
+
+ if (policer_id == 0)
+ return 0;
+
+ policer_item = devlink_trap_policer_item_lookup(devlink, policer_id);
+ if (WARN_ON_ONCE(!policer_item))
+ return -EINVAL;
+
+ group_item->policer_item = policer_item;
+
+ return 0;
+}
+
+static int
+devlink_trap_group_register(struct devlink *devlink,
+ const struct devlink_trap_group *group)
+{
+ struct devlink_trap_group_item *group_item;
+ int err;
+
+ if (devlink_trap_group_item_lookup(devlink, group->name))
+ return -EEXIST;
+
+ group_item = kzalloc(sizeof(*group_item), GFP_KERNEL);
+ if (!group_item)
+ return -ENOMEM;
+
+ group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
+ if (!group_item->stats) {
+ err = -ENOMEM;
+ goto err_stats_alloc;
+ }
+
+ group_item->group = group;
+
+ err = devlink_trap_group_item_policer_link(devlink, group_item);
+ if (err)
+ goto err_policer_link;
+
+ if (devlink->ops->trap_group_init) {
+ err = devlink->ops->trap_group_init(devlink, group);
+ if (err)
+ goto err_group_init;
+ }
+
+ list_add_tail(&group_item->list, &devlink->trap_group_list);
+ devlink_trap_group_notify(devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_NEW);
+
+ return 0;
+
+err_group_init:
+err_policer_link:
+ free_percpu(group_item->stats);
+err_stats_alloc:
+ kfree(group_item);
+ return err;
+}
+
+static void
+devlink_trap_group_unregister(struct devlink *devlink,
+ const struct devlink_trap_group *group)
+{
+ struct devlink_trap_group_item *group_item;
+
+ group_item = devlink_trap_group_item_lookup(devlink, group->name);
+ if (WARN_ON_ONCE(!group_item))
+ return;
+
+ devlink_trap_group_notify(devlink, group_item,
+ DEVLINK_CMD_TRAP_GROUP_DEL);
+ list_del(&group_item->list);
+ free_percpu(group_item->stats);
+ kfree(group_item);
+}
+
+/**
+ * devl_trap_groups_register - Register packet trap groups with devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devl_trap_groups_register(struct devlink *devlink,
+ const struct devlink_trap_group *groups,
+ size_t groups_count)
+{
+ int i, err;
+
+ devl_assert_locked(devlink);
+ for (i = 0; i < groups_count; i++) {
+ const struct devlink_trap_group *group = &groups[i];
+
+ err = devlink_trap_group_verify(group);
+ if (err)
+ goto err_trap_group_verify;
+
+ err = devlink_trap_group_register(devlink, group);
+ if (err)
+ goto err_trap_group_register;
+ }
+
+ return 0;
+
+err_trap_group_register:
+err_trap_group_verify:
+ for (i--; i >= 0; i--)
+ devlink_trap_group_unregister(devlink, &groups[i]);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devl_trap_groups_register);
+
+/**
+ * devlink_trap_groups_register - Register packet trap groups with devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devlink_trap_groups_register(struct devlink *devlink,
+ const struct devlink_trap_group *groups,
+ size_t groups_count)
+{
+ int err;
+
+ devl_lock(devlink);
+ err = devl_trap_groups_register(devlink, groups, groups_count);
+ devl_unlock(devlink);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_trap_groups_register);
+
+/**
+ * devl_trap_groups_unregister - Unregister packet trap groups from devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ */
+void devl_trap_groups_unregister(struct devlink *devlink,
+ const struct devlink_trap_group *groups,
+ size_t groups_count)
+{
+ int i;
+
+ devl_assert_locked(devlink);
+ for (i = groups_count - 1; i >= 0; i--)
+ devlink_trap_group_unregister(devlink, &groups[i]);
+}
+EXPORT_SYMBOL_GPL(devl_trap_groups_unregister);
+
+/**
+ * devlink_trap_groups_unregister - Unregister packet trap groups from devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ *
+ * Context: Takes and release devlink->lock <mutex>.
+ */
+void devlink_trap_groups_unregister(struct devlink *devlink,
+ const struct devlink_trap_group *groups,
+ size_t groups_count)
+{
+ devl_lock(devlink);
+ devl_trap_groups_unregister(devlink, groups, groups_count);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister);
+
+static void
+devlink_trap_policer_notify(struct devlink *devlink,
+ const struct devlink_trap_policer_item *policer_item,
+ enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW &&
+ cmd != DEVLINK_CMD_TRAP_POLICER_DEL);
+ if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd, 0,
+ 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int
+devlink_trap_policer_register(struct devlink *devlink,
+ const struct devlink_trap_policer *policer)
+{
+ struct devlink_trap_policer_item *policer_item;
+ int err;
+
+ if (devlink_trap_policer_item_lookup(devlink, policer->id))
+ return -EEXIST;
+
+ policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL);
+ if (!policer_item)
+ return -ENOMEM;
+
+ policer_item->policer = policer;
+ policer_item->rate = policer->init_rate;
+ policer_item->burst = policer->init_burst;
+
+ if (devlink->ops->trap_policer_init) {
+ err = devlink->ops->trap_policer_init(devlink, policer);
+ if (err)
+ goto err_policer_init;
+ }
+
+ list_add_tail(&policer_item->list, &devlink->trap_policer_list);
+ devlink_trap_policer_notify(devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_NEW);
+
+ return 0;
+
+err_policer_init:
+ kfree(policer_item);
+ return err;
+}
+
+static void
+devlink_trap_policer_unregister(struct devlink *devlink,
+ const struct devlink_trap_policer *policer)
+{
+ struct devlink_trap_policer_item *policer_item;
+
+ policer_item = devlink_trap_policer_item_lookup(devlink, policer->id);
+ if (WARN_ON_ONCE(!policer_item))
+ return;
+
+ devlink_trap_policer_notify(devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_DEL);
+ list_del(&policer_item->list);
+ if (devlink->ops->trap_policer_fini)
+ devlink->ops->trap_policer_fini(devlink, policer);
+ kfree(policer_item);
+}
+
+/**
+ * devl_trap_policers_register - Register packet trap policers with devlink.
+ * @devlink: devlink.
+ * @policers: Packet trap policers.
+ * @policers_count: Count of provided packet trap policers.
+ *
+ * Return: Non-zero value on failure.
+ */
+int
+devl_trap_policers_register(struct devlink *devlink,
+ const struct devlink_trap_policer *policers,
+ size_t policers_count)
+{
+ int i, err;
+
+ devl_assert_locked(devlink);
+ for (i = 0; i < policers_count; i++) {
+ const struct devlink_trap_policer *policer = &policers[i];
+
+ if (WARN_ON(policer->id == 0 ||
+ policer->max_rate < policer->min_rate ||
+ policer->max_burst < policer->min_burst)) {
+ err = -EINVAL;
+ goto err_trap_policer_verify;
+ }
+
+ err = devlink_trap_policer_register(devlink, policer);
+ if (err)
+ goto err_trap_policer_register;
+ }
+ return 0;
+
+err_trap_policer_register:
+err_trap_policer_verify:
+ for (i--; i >= 0; i--)
+ devlink_trap_policer_unregister(devlink, &policers[i]);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devl_trap_policers_register);
+
+/**
+ * devl_trap_policers_unregister - Unregister packet trap policers from devlink.
+ * @devlink: devlink.
+ * @policers: Packet trap policers.
+ * @policers_count: Count of provided packet trap policers.
+ */
+void
+devl_trap_policers_unregister(struct devlink *devlink,
+ const struct devlink_trap_policer *policers,
+ size_t policers_count)
+{
+ int i;
+
+ devl_assert_locked(devlink);
+ for (i = policers_count - 1; i >= 0; i--)
+ devlink_trap_policer_unregister(devlink, &policers[i]);
+}
+EXPORT_SYMBOL_GPL(devl_trap_policers_unregister);
+
static void __devlink_compat_running_version(struct devlink *devlink,
char *buf, size_t len)
{
+ struct devlink_info_req req = {};
const struct nlattr *nlattr;
- struct devlink_info_req req;
struct sk_buff *msg;
int rem, err;
@@ -8223,48 +12416,44 @@ free_msg:
nlmsg_free(msg);
}
-void devlink_compat_running_version(struct net_device *dev,
- char *buf, size_t len)
+static struct devlink_port *netdev_to_devlink_port(struct net_device *dev)
{
- struct devlink *devlink;
+ if (!dev->netdev_ops->ndo_get_devlink_port)
+ return NULL;
- dev_hold(dev);
- rtnl_unlock();
+ return dev->netdev_ops->ndo_get_devlink_port(dev);
+}
- devlink = netdev_to_devlink(dev);
- if (!devlink || !devlink->ops->info_get)
- goto out;
+void devlink_compat_running_version(struct devlink *devlink,
+ char *buf, size_t len)
+{
+ if (!devlink->ops->info_get)
+ return;
- mutex_lock(&devlink->lock);
+ devl_lock(devlink);
__devlink_compat_running_version(devlink, buf, len);
- mutex_unlock(&devlink->lock);
-
-out:
- rtnl_lock();
- dev_put(dev);
+ devl_unlock(devlink);
}
-int devlink_compat_flash_update(struct net_device *dev, const char *file_name)
+int devlink_compat_flash_update(struct devlink *devlink, const char *file_name)
{
- struct devlink *devlink;
+ struct devlink_flash_update_params params = {};
int ret;
- dev_hold(dev);
- rtnl_unlock();
+ if (!devlink->ops->flash_update)
+ return -EOPNOTSUPP;
- devlink = netdev_to_devlink(dev);
- if (!devlink || !devlink->ops->flash_update) {
- ret = -EOPNOTSUPP;
- goto out;
- }
+ ret = request_firmware(&params.fw, file_name, devlink->dev);
+ if (ret)
+ return ret;
- mutex_lock(&devlink->lock);
- ret = devlink->ops->flash_update(devlink, file_name, NULL, NULL);
- mutex_unlock(&devlink->lock);
+ devl_lock(devlink);
+ devlink_flash_update_begin_notify(devlink);
+ ret = devlink->ops->flash_update(devlink, &params, NULL);
+ devlink_flash_update_end_notify(devlink);
+ devl_unlock(devlink);
-out:
- rtnl_lock();
- dev_put(dev);
+ release_firmware(params.fw);
return ret;
}
@@ -8297,7 +12486,7 @@ int devlink_compat_switch_id_get(struct net_device *dev,
* any devlink lock as only permanent values are accessed.
*/
devlink_port = netdev_to_devlink_port(dev);
- if (!devlink_port || !devlink_port->attrs.switch_port)
+ if (!devlink_port || !devlink_port->switch_port)
return -EOPNOTSUPP;
memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid));
@@ -8308,22 +12497,25 @@ int devlink_compat_switch_id_get(struct net_device *dev,
static void __net_exit devlink_pernet_pre_exit(struct net *net)
{
struct devlink *devlink;
+ u32 actions_performed;
+ unsigned long index;
int err;
/* In case network namespace is getting destroyed, reload
* all devlink instances from this namespace into init_net.
*/
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (net_eq(devlink_net(devlink), net)) {
- if (WARN_ON(!devlink_reload_supported(devlink)))
- continue;
- err = devlink_reload(devlink, &init_net, NULL);
- if (err && err != -EOPNOTSUPP)
- pr_warn("Failed to reload devlink instance into init_net\n");
- }
+ devlinks_xa_for_each_registered_get(net, index, devlink) {
+ WARN_ON(!(devlink->features & DEVLINK_F_RELOAD));
+ mutex_lock(&devlink->lock);
+ err = devlink_reload(devlink, &init_net,
+ DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
+ DEVLINK_RELOAD_LIMIT_UNSPEC,
+ &actions_performed, NULL);
+ mutex_unlock(&devlink->lock);
+ if (err && err != -EOPNOTSUPP)
+ pr_warn("Failed to reload devlink instance into init_net\n");
+ devlink_put(devlink);
}
- mutex_unlock(&devlink_mutex);
}
static struct pernet_operations devlink_pernet_ops __net_initdata = {
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 31700e0c3928..f084a4a6b7ab 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -26,12 +26,14 @@
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/module.h>
-#include <net/drop_monitor.h>
#include <net/genetlink.h>
#include <net/netevent.h>
+#include <net/flow_offload.h>
+#include <net/devlink.h>
#include <trace/events/skb.h>
#include <trace/events/napi.h>
+#include <trace/events/devlink.h>
#include <asm/unaligned.h>
@@ -49,12 +51,11 @@ static bool monitor_hw;
/* net_dm_mutex
*
* An overall lock guarding every operation coming from userspace.
- * It also guards the global 'hw_stats_list' list.
*/
static DEFINE_MUTEX(net_dm_mutex);
struct net_dm_stats {
- u64 dropped;
+ u64_stats_t dropped;
struct u64_stats_sync syncp;
};
@@ -67,7 +68,7 @@ struct net_dm_hw_entry {
struct net_dm_hw_entries {
u32 num_entries;
- struct net_dm_hw_entry entries[0];
+ struct net_dm_hw_entry entries[];
};
struct per_cpu_dm_data {
@@ -85,11 +86,9 @@ struct per_cpu_dm_data {
};
struct dm_hw_stat_delta {
- struct net_device *dev;
unsigned long last_rx;
- struct list_head list;
- struct rcu_head rcu;
unsigned long last_drop_val;
+ struct rcu_head rcu;
};
static struct genl_family net_drop_monitor_family;
@@ -100,7 +99,6 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
static unsigned long dm_hw_check_delta = 2*HZ;
-static LIST_HEAD(hw_stats_list);
static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY;
static u32 net_dm_trunc_len;
@@ -108,20 +106,23 @@ static u32 net_dm_queue_len = 1000;
struct net_dm_alert_ops {
void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb,
- void *location);
+ void *location,
+ enum skb_drop_reason reason);
void (*napi_poll_probe)(void *ignore, struct napi_struct *napi,
int work, int budget);
void (*work_item_func)(struct work_struct *work);
void (*hw_work_item_func)(struct work_struct *work);
- void (*hw_probe)(struct sk_buff *skb,
- const struct net_dm_hw_metadata *hw_metadata);
+ void (*hw_trap_probe)(void *ignore, const struct devlink *devlink,
+ struct sk_buff *skb,
+ const struct devlink_trap_metadata *metadata);
};
struct net_dm_skb_cb {
union {
- struct net_dm_hw_metadata *hw_metadata;
+ struct devlink_trap_metadata *hw_metadata;
void *pc;
};
+ enum skb_drop_reason reason;
};
#define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0]))
@@ -212,6 +213,7 @@ static void sched_send_work(struct timer_list *t)
static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
+ struct net_dm_drop_point *point;
struct nlmsghdr *nlh;
struct nlattr *nla;
int i;
@@ -230,11 +232,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
nlh = (struct nlmsghdr *)dskb->data;
nla = genlmsg_data(nlmsg_data(nlh));
msg = nla_data(nla);
+ point = msg->points;
for (i = 0; i < msg->entries; i++) {
- if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
- msg->points[i].count++;
+ if (!memcmp(&location, &point->pc, sizeof(void *))) {
+ point->count++;
goto out;
}
+ point++;
}
if (msg->entries == dm_hit_limit)
goto out;
@@ -243,8 +247,8 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
*/
__nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
- memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
- msg->points[msg->entries].count = 1;
+ memcpy(point->pc, &location, sizeof(void *));
+ point->count = 1;
msg->entries++;
if (!timer_pending(&data->send_timer)) {
@@ -256,7 +260,9 @@ out:
spin_unlock_irqrestore(&data->lock, flags);
}
-static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
+static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb,
+ void *location,
+ enum skb_drop_reason reason)
{
trace_drop_common(skb, location);
}
@@ -264,29 +270,27 @@ static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *locatio
static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi,
int work, int budget)
{
- struct dm_hw_stat_delta *new_stat;
-
+ struct net_device *dev = napi->dev;
+ struct dm_hw_stat_delta *stat;
/*
* Don't check napi structures with no associated device
*/
- if (!napi->dev)
+ if (!dev)
return;
rcu_read_lock();
- list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ stat = rcu_dereference(dev->dm_private);
+ if (stat) {
/*
* only add a note to our monitor buffer if:
- * 1) this is the dev we received on
- * 2) its after the last_rx delta
- * 3) our rx_dropped count has gone up
+ * 1) its after the last_rx delta
+ * 2) our rx_dropped count has gone up
*/
- if ((new_stat->dev == napi->dev) &&
- (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
- (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) &&
+ (dev->stats.rx_dropped != stat->last_drop_val)) {
trace_drop_common(NULL, NULL);
- new_stat->last_drop_val = napi->dev->stats.rx_dropped;
- new_stat->last_rx = jiffies;
- break;
+ stat->last_drop_val = dev->stats.rx_dropped;
+ stat->last_rx = jiffies;
}
}
rcu_read_unlock();
@@ -428,8 +432,9 @@ out:
}
static void
-net_dm_hw_summary_probe(struct sk_buff *skb,
- const struct net_dm_hw_metadata *hw_metadata)
+net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
+ struct sk_buff *skb,
+ const struct devlink_trap_metadata *metadata)
{
struct net_dm_hw_entries *hw_entries;
struct net_dm_hw_entry *hw_entry;
@@ -437,6 +442,9 @@ net_dm_hw_summary_probe(struct sk_buff *skb,
unsigned long flags;
int i;
+ if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL)
+ return;
+
hw_data = this_cpu_ptr(&dm_hw_cpu_data);
spin_lock_irqsave(&hw_data->lock, flags);
hw_entries = hw_data->hw_entries;
@@ -446,7 +454,7 @@ net_dm_hw_summary_probe(struct sk_buff *skb,
for (i = 0; i < hw_entries->num_entries; i++) {
hw_entry = &hw_entries->entries[i];
- if (!strncmp(hw_entry->trap_name, hw_metadata->trap_name,
+ if (!strncmp(hw_entry->trap_name, metadata->trap_name,
NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) {
hw_entry->count++;
goto out;
@@ -456,7 +464,7 @@ net_dm_hw_summary_probe(struct sk_buff *skb,
goto out;
hw_entry = &hw_entries->entries[hw_entries->num_entries];
- strlcpy(hw_entry->trap_name, hw_metadata->trap_name,
+ strscpy(hw_entry->trap_name, metadata->trap_name,
NET_DM_MAX_HW_TRAP_NAME_LEN - 1);
hw_entry->count = 1;
hw_entries->num_entries++;
@@ -475,15 +483,17 @@ static const struct net_dm_alert_ops net_dm_alert_summary_ops = {
.napi_poll_probe = trace_napi_poll_hit,
.work_item_func = send_dm_alert,
.hw_work_item_func = net_dm_hw_summary_work,
- .hw_probe = net_dm_hw_summary_probe,
+ .hw_trap_probe = net_dm_hw_trap_summary_probe,
};
static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
struct sk_buff *skb,
- void *location)
+ void *location,
+ enum skb_drop_reason reason)
{
ktime_t tstamp = ktime_get_real();
struct per_cpu_dm_data *data;
+ struct net_dm_skb_cb *cb;
struct sk_buff *nskb;
unsigned long flags;
@@ -494,7 +504,11 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
if (!nskb)
return;
- NET_DM_SKB_CB(nskb)->pc = location;
+ if (unlikely(reason >= SKB_DROP_REASON_MAX || reason <= 0))
+ reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ cb = NET_DM_SKB_CB(nskb);
+ cb->reason = reason;
+ cb->pc = location;
/* Override the timestamp because we care about the time when the
* packet was dropped.
*/
@@ -516,7 +530,7 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
unlock_free:
spin_unlock_irqrestore(&data->drop_queue.lock, flags);
u64_stats_update_begin(&data->stats.syncp);
- data->stats.dropped++;
+ u64_stats_inc(&data->stats.dropped);
u64_stats_update_end(&data->stats.syncp);
consume_skb(nskb);
}
@@ -539,7 +553,8 @@ static size_t net_dm_in_port_size(void)
#define NET_DM_MAX_SYMBOL_LEN 40
-static size_t net_dm_packet_report_size(size_t payload_len)
+static size_t net_dm_packet_report_size(size_t payload_len,
+ enum skb_drop_reason reason)
{
size_t size;
@@ -560,6 +575,8 @@ static size_t net_dm_packet_report_size(size_t payload_len)
nla_total_size(sizeof(u32)) +
/* NET_DM_ATTR_PROTO */
nla_total_size(sizeof(u16)) +
+ /* NET_DM_ATTR_REASON */
+ nla_total_size(strlen(drop_reasons[reason]) + 1) +
/* NET_DM_ATTR_PAYLOAD */
nla_total_size(payload_len);
}
@@ -592,7 +609,7 @@ nla_put_failure:
static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
size_t payload_len)
{
- u64 pc = (u64)(uintptr_t) NET_DM_SKB_CB(skb)->pc;
+ struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb);
char buf[NET_DM_MAX_SYMBOL_LEN];
struct nlattr *attr;
void *hdr;
@@ -606,10 +623,15 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW))
goto nla_put_failure;
- if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, pc, NET_DM_ATTR_PAD))
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc,
+ NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, NET_DM_ATTR_REASON,
+ drop_reasons[cb->reason]))
goto nla_put_failure;
- snprintf(buf, sizeof(buf), "%pS", NET_DM_SKB_CB(skb)->pc);
+ snprintf(buf, sizeof(buf), "%pS", cb->pc);
if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf))
goto nla_put_failure;
@@ -665,7 +687,9 @@ static void net_dm_packet_report(struct sk_buff *skb)
if (net_dm_trunc_len)
payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
- msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL);
+ msg = nlmsg_new(net_dm_packet_report_size(payload_len,
+ NET_DM_SKB_CB(skb)->reason),
+ GFP_KERNEL);
if (!msg)
goto out;
@@ -701,8 +725,15 @@ static void net_dm_packet_work(struct work_struct *work)
}
static size_t
+net_dm_flow_action_cookie_size(const struct devlink_trap_metadata *hw_metadata)
+{
+ return hw_metadata->fa_cookie ?
+ nla_total_size(hw_metadata->fa_cookie->cookie_len) : 0;
+}
+
+static size_t
net_dm_hw_packet_report_size(size_t payload_len,
- const struct net_dm_hw_metadata *hw_metadata)
+ const struct devlink_trap_metadata *hw_metadata)
{
size_t size;
@@ -717,6 +748,8 @@ net_dm_hw_packet_report_size(size_t payload_len,
nla_total_size(strlen(hw_metadata->trap_name) + 1) +
/* NET_DM_ATTR_IN_PORT */
net_dm_in_port_size() +
+ /* NET_DM_ATTR_FLOW_ACTION_COOKIE */
+ net_dm_flow_action_cookie_size(hw_metadata) +
/* NET_DM_ATTR_TIMESTAMP */
nla_total_size(sizeof(u64)) +
/* NET_DM_ATTR_ORIG_LEN */
@@ -730,7 +763,7 @@ net_dm_hw_packet_report_size(size_t payload_len,
static int net_dm_hw_packet_report_fill(struct sk_buff *msg,
struct sk_buff *skb, size_t payload_len)
{
- struct net_dm_hw_metadata *hw_metadata;
+ struct devlink_trap_metadata *hw_metadata;
struct nlattr *attr;
void *hdr;
@@ -762,6 +795,12 @@ static int net_dm_hw_packet_report_fill(struct sk_buff *msg,
goto nla_put_failure;
}
+ if (hw_metadata->fa_cookie &&
+ nla_put(msg, NET_DM_ATTR_FLOW_ACTION_COOKIE,
+ hw_metadata->fa_cookie->cookie_len,
+ hw_metadata->fa_cookie->cookie))
+ goto nla_put_failure;
+
if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP,
ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD))
goto nla_put_failure;
@@ -791,45 +830,59 @@ nla_put_failure:
return -EMSGSIZE;
}
-static struct net_dm_hw_metadata *
-net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata)
+static struct devlink_trap_metadata *
+net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata)
{
- struct net_dm_hw_metadata *n_hw_metadata;
+ const struct flow_action_cookie *fa_cookie;
+ struct devlink_trap_metadata *hw_metadata;
const char *trap_group_name;
const char *trap_name;
- n_hw_metadata = kmalloc(sizeof(*hw_metadata), GFP_ATOMIC);
- if (!n_hw_metadata)
+ hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC);
+ if (!hw_metadata)
return NULL;
- trap_group_name = kstrdup(hw_metadata->trap_group_name, GFP_ATOMIC);
+ trap_group_name = kstrdup(metadata->trap_group_name, GFP_ATOMIC);
if (!trap_group_name)
goto free_hw_metadata;
- n_hw_metadata->trap_group_name = trap_group_name;
+ hw_metadata->trap_group_name = trap_group_name;
- trap_name = kstrdup(hw_metadata->trap_name, GFP_ATOMIC);
+ trap_name = kstrdup(metadata->trap_name, GFP_ATOMIC);
if (!trap_name)
goto free_trap_group;
- n_hw_metadata->trap_name = trap_name;
+ hw_metadata->trap_name = trap_name;
+
+ if (metadata->fa_cookie) {
+ size_t cookie_size = sizeof(*fa_cookie) +
+ metadata->fa_cookie->cookie_len;
- n_hw_metadata->input_dev = hw_metadata->input_dev;
- if (n_hw_metadata->input_dev)
- dev_hold(n_hw_metadata->input_dev);
+ fa_cookie = kmemdup(metadata->fa_cookie, cookie_size,
+ GFP_ATOMIC);
+ if (!fa_cookie)
+ goto free_trap_name;
+ hw_metadata->fa_cookie = fa_cookie;
+ }
- return n_hw_metadata;
+ hw_metadata->input_dev = metadata->input_dev;
+ netdev_hold(hw_metadata->input_dev, &hw_metadata->dev_tracker,
+ GFP_ATOMIC);
+ return hw_metadata;
+
+free_trap_name:
+ kfree(trap_name);
free_trap_group:
kfree(trap_group_name);
free_hw_metadata:
- kfree(n_hw_metadata);
+ kfree(hw_metadata);
return NULL;
}
static void
-net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata)
+net_dm_hw_metadata_free(struct devlink_trap_metadata *hw_metadata)
{
- if (hw_metadata->input_dev)
- dev_put(hw_metadata->input_dev);
+ netdev_put(hw_metadata->input_dev, &hw_metadata->dev_tracker);
+ kfree(hw_metadata->fa_cookie);
kfree(hw_metadata->trap_name);
kfree(hw_metadata->trap_group_name);
kfree(hw_metadata);
@@ -837,7 +890,7 @@ net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata)
static void net_dm_hw_packet_report(struct sk_buff *skb)
{
- struct net_dm_hw_metadata *hw_metadata;
+ struct devlink_trap_metadata *hw_metadata;
struct sk_buff *msg;
size_t payload_len;
int rc;
@@ -890,15 +943,19 @@ static void net_dm_hw_packet_work(struct work_struct *work)
}
static void
-net_dm_hw_packet_probe(struct sk_buff *skb,
- const struct net_dm_hw_metadata *hw_metadata)
+net_dm_hw_trap_packet_probe(void *ignore, const struct devlink *devlink,
+ struct sk_buff *skb,
+ const struct devlink_trap_metadata *metadata)
{
- struct net_dm_hw_metadata *n_hw_metadata;
+ struct devlink_trap_metadata *n_hw_metadata;
ktime_t tstamp = ktime_get_real();
struct per_cpu_dm_data *hw_data;
struct sk_buff *nskb;
unsigned long flags;
+ if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL)
+ return;
+
if (!skb_mac_header_was_set(skb))
return;
@@ -906,7 +963,7 @@ net_dm_hw_packet_probe(struct sk_buff *skb,
if (!nskb)
return;
- n_hw_metadata = net_dm_hw_metadata_clone(hw_metadata);
+ n_hw_metadata = net_dm_hw_metadata_copy(metadata);
if (!n_hw_metadata)
goto free;
@@ -929,7 +986,7 @@ net_dm_hw_packet_probe(struct sk_buff *skb,
unlock_free:
spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags);
u64_stats_update_begin(&hw_data->stats.syncp);
- hw_data->stats.dropped++;
+ u64_stats_inc(&hw_data->stats.dropped);
u64_stats_update_end(&hw_data->stats.syncp);
net_dm_hw_metadata_free(n_hw_metadata);
free:
@@ -941,7 +998,7 @@ static const struct net_dm_alert_ops net_dm_alert_packet_ops = {
.napi_poll_probe = net_dm_packet_trace_napi_poll_hit,
.work_item_func = net_dm_packet_work,
.hw_work_item_func = net_dm_hw_packet_work,
- .hw_probe = net_dm_hw_packet_probe,
+ .hw_trap_probe = net_dm_hw_trap_packet_probe,
};
static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = {
@@ -949,25 +1006,32 @@ static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = {
[NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops,
};
-void net_dm_hw_report(struct sk_buff *skb,
- const struct net_dm_hw_metadata *hw_metadata)
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops)
{
- rcu_read_lock();
-
- if (!monitor_hw)
- goto out;
+ return register_trace_devlink_trap_report(ops->hw_trap_probe, NULL);
+}
- net_dm_alert_ops_arr[net_dm_alert_mode]->hw_probe(skb, hw_metadata);
+static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops)
+{
+ unregister_trace_devlink_trap_report(ops->hw_trap_probe, NULL);
+ tracepoint_synchronize_unregister();
+}
+#else
+static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops)
+{
+ return -EOPNOTSUPP;
+}
-out:
- rcu_read_unlock();
+static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops)
+{
}
-EXPORT_SYMBOL_GPL(net_dm_hw_report);
+#endif
static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack)
{
const struct net_dm_alert_ops *ops;
- int cpu;
+ int cpu, rc;
if (monitor_hw) {
NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled");
@@ -991,13 +1055,38 @@ static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack)
kfree(hw_entries);
}
+ rc = net_dm_hw_probe_register(ops);
+ if (rc) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to devlink_trap_probe() tracepoint");
+ goto err_module_put;
+ }
+
monitor_hw = true;
return 0;
+
+err_module_put:
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ del_timer_sync(&hw_data->send_timer);
+ cancel_work_sync(&hw_data->dm_alert_work);
+ while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
+ struct devlink_trap_metadata *hw_metadata;
+
+ hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+ net_dm_hw_metadata_free(hw_metadata);
+ consume_skb(skb);
+ }
+ }
+ module_put(THIS_MODULE);
+ return rc;
}
static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack)
{
+ const struct net_dm_alert_ops *ops;
int cpu;
if (!monitor_hw) {
@@ -1005,12 +1094,11 @@ static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack)
return;
}
+ ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
monitor_hw = false;
- /* After this call returns we are guaranteed that no CPU is processing
- * any hardware drops.
- */
- synchronize_rcu();
+ net_dm_hw_probe_unregister(ops);
for_each_possible_cpu(cpu) {
struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
@@ -1019,7 +1107,7 @@ static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack)
del_timer_sync(&hw_data->send_timer);
cancel_work_sync(&hw_data->dm_alert_work);
while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
- struct net_dm_hw_metadata *hw_metadata;
+ struct devlink_trap_metadata *hw_metadata;
hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
net_dm_hw_metadata_free(hw_metadata);
@@ -1073,13 +1161,21 @@ static int net_dm_trace_on_set(struct netlink_ext_ack *extack)
err_unregister_trace:
unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL);
err_module_put:
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ del_timer_sync(&data->send_timer);
+ cancel_work_sync(&data->dm_alert_work);
+ while ((skb = __skb_dequeue(&data->drop_queue)))
+ consume_skb(skb);
+ }
module_put(THIS_MODULE);
return rc;
}
static void net_dm_trace_off_set(void)
{
- struct dm_hw_stat_delta *new_stat, *temp;
const struct net_dm_alert_ops *ops;
int cpu;
@@ -1103,13 +1199,6 @@ static void net_dm_trace_off_set(void)
consume_skb(skb);
}
- list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
- if (new_stat->dev == NULL) {
- list_del_rcu(&new_stat->list);
- kfree_rcu(new_stat, rcu);
- }
- }
-
module_put(THIS_MODULE);
}
@@ -1155,7 +1244,7 @@ static int net_dm_alert_mode_get_from_info(struct genl_info *info,
val = nla_get_u8(info->attrs[NET_DM_ATTR_ALERT_MODE]);
switch (val) {
- case NET_DM_ALERT_MODE_SUMMARY: /* fall-through */
+ case NET_DM_ALERT_MODE_SUMMARY:
case NET_DM_ALERT_MODE_PACKET:
*p_alert_mode = val;
break;
@@ -1344,10 +1433,10 @@ static void net_dm_stats_read(struct net_dm_stats *stats)
do {
start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
- dropped = cpu_stats->dropped;
+ dropped = u64_stats_read(&cpu_stats->dropped);
} while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
- stats->dropped += dropped;
+ u64_stats_add(&stats->dropped, dropped);
}
}
@@ -1363,7 +1452,7 @@ static int net_dm_stats_put(struct sk_buff *msg)
return -EMSGSIZE;
if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED,
- stats.dropped, NET_DM_ATTR_PAD))
+ u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD))
goto nla_put_failure;
nla_nest_end(msg, attr);
@@ -1388,10 +1477,10 @@ static void net_dm_hw_stats_read(struct net_dm_stats *stats)
do {
start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
- dropped = cpu_stats->dropped;
+ dropped = u64_stats_read(&cpu_stats->dropped);
} while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
- stats->dropped += dropped;
+ u64_stats_add(&stats->dropped, dropped);
}
}
@@ -1407,7 +1496,7 @@ static int net_dm_hw_stats_put(struct sk_buff *msg)
return -EMSGSIZE;
if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED,
- stats.dropped, NET_DM_ATTR_PAD))
+ u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD))
goto nla_put_failure;
nla_nest_end(msg, attr);
@@ -1470,38 +1559,28 @@ static int dropmon_net_event(struct notifier_block *ev_block,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct dm_hw_stat_delta *new_stat = NULL;
- struct dm_hw_stat_delta *tmp;
+ struct dm_hw_stat_delta *stat;
switch (event) {
case NETDEV_REGISTER:
- new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+ if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private)))
+ break;
+ stat = kzalloc(sizeof(*stat), GFP_KERNEL);
+ if (!stat)
+ break;
- if (!new_stat)
- goto out;
+ stat->last_rx = jiffies;
+ rcu_assign_pointer(dev->dm_private, stat);
- new_stat->dev = dev;
- new_stat->last_rx = jiffies;
- mutex_lock(&net_dm_mutex);
- list_add_rcu(&new_stat->list, &hw_stats_list);
- mutex_unlock(&net_dm_mutex);
break;
case NETDEV_UNREGISTER:
- mutex_lock(&net_dm_mutex);
- list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
- if (new_stat->dev == dev) {
- new_stat->dev = NULL;
- if (trace_state == TRACE_OFF) {
- list_del_rcu(&new_stat->list);
- kfree_rcu(new_stat, rcu);
- break;
- }
- }
+ stat = rtnl_dereference(dev->dm_private);
+ if (stat) {
+ rcu_assign_pointer(dev->dm_private, NULL);
+ kfree_rcu(stat, rcu);
}
- mutex_unlock(&net_dm_mutex);
break;
}
-out:
return NOTIFY_DONE;
}
@@ -1514,7 +1593,7 @@ static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = {
[NET_DM_ATTR_HW_DROPS] = {. type = NLA_FLAG },
};
-static const struct genl_ops dropmon_ops[] = {
+static const struct genl_small_ops dropmon_ops[] = {
{
.cmd = NET_DM_CMD_CONFIG,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
@@ -1564,8 +1643,9 @@ static struct genl_family net_drop_monitor_family __ro_after_init = {
.pre_doit = net_dm_nl_pre_doit,
.post_doit = net_dm_nl_post_doit,
.module = THIS_MODULE,
- .ops = dropmon_ops,
- .n_ops = ARRAY_SIZE(dropmon_ops),
+ .small_ops = dropmon_ops,
+ .n_small_ops = ARRAY_SIZE(dropmon_ops),
+ .resv_start_op = NET_DM_CMD_STATS_GET + 1,
.mcgrps = dropmon_mcgrps,
.n_mcgrps = ARRAY_SIZE(dropmon_mcgrps),
};
@@ -1670,7 +1750,7 @@ static void exit_net_drop_monitor(void)
/*
* Because of the module_get/put we do in the trace state change path
- * we are guarnateed not to have any current users when we get here
+ * we are guaranteed not to have any current users when we get here
*/
for_each_possible_cpu(cpu) {
@@ -1687,3 +1767,4 @@ module_exit(exit_net_drop_monitor);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>");
MODULE_ALIAS_GENL_FAMILY("NET_DM");
+MODULE_DESCRIPTION("Monitoring code for network dropped packet alerts");
diff --git a/net/core/dst.c b/net/core/dst.c
index 193af526e908..bc9c9be4e080 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -49,8 +49,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
unsigned short flags)
{
dst->dev = dev;
- if (dev)
- dev_hold(dev);
+ netdev_hold(dev, &dst->dev_tracker, GFP_ATOMIC);
dst->ops = ops;
dst_init_metrics(dst, dst_default_metrics.metrics, true);
dst->expires = 0UL;
@@ -81,11 +80,11 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
{
struct dst_entry *dst;
- if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
+ if (ops->gc &&
+ !(flags & DST_NOCOUNT) &&
+ dst_entries_get_fast(ops) > ops->gc_thresh) {
if (ops->gc(ops)) {
- printk_ratelimited(KERN_NOTICE "Route cache is full: "
- "consider increasing sysctl "
- "net.ipv[4|6].route.max_size.\n");
+ pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n");
return NULL;
}
}
@@ -118,8 +117,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
if (dst->ops->destroy)
dst->ops->destroy(dst);
- if (dst->dev)
- dev_put(dst->dev);
+ netdev_put(dst->dev, &dst->dev_tracker);
lwtstate_put(dst->lwtstate);
@@ -144,7 +142,7 @@ static void dst_destroy_rcu(struct rcu_head *head)
/* Operations to mark dst as DEAD and clean up the net device referenced
* by dst:
- * 1. put the dst under loopback interface and discard all tx/rx packets
+ * 1. put the dst under blackhole interface and discard all tx/rx packets
* on this route.
* 2. release the net_device
* This function should be called when removing routes from the fib tree
@@ -161,8 +159,8 @@ void dst_dev_put(struct dst_entry *dst)
dst->input = dst_discard;
dst->output = dst_discard_out;
dst->dev = blackhole_netdev;
- dev_hold(dst->dev);
- dev_put(dev);
+ netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker,
+ GFP_ATOMIC);
}
EXPORT_SYMBOL(dst_dev_put);
@@ -237,37 +235,62 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);
-static struct dst_ops md_dst_ops = {
- .family = AF_UNSPEC,
-};
+struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie)
+{
+ return NULL;
+}
-static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old)
{
- WARN_ONCE(1, "Attempting to call output on metadata dst\n");
- kfree_skb(skb);
- return 0;
+ return NULL;
}
-static int dst_md_discard(struct sk_buff *skb)
+struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
{
- WARN_ONCE(1, "Attempting to call input on metadata dst\n");
- kfree_skb(skb);
- return 0;
+ return NULL;
+}
+
+void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
+{
+}
+EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu);
+
+void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
+{
+}
+EXPORT_SYMBOL_GPL(dst_blackhole_redirect);
+
+unsigned int dst_blackhole_mtu(const struct dst_entry *dst)
+{
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ return mtu ? : dst->dev->mtu;
}
+EXPORT_SYMBOL_GPL(dst_blackhole_mtu);
+
+static struct dst_ops dst_blackhole_ops = {
+ .family = AF_UNSPEC,
+ .neigh_lookup = dst_blackhole_neigh_lookup,
+ .check = dst_blackhole_check,
+ .cow_metrics = dst_blackhole_cow_metrics,
+ .update_pmtu = dst_blackhole_update_pmtu,
+ .redirect = dst_blackhole_redirect,
+ .mtu = dst_blackhole_mtu,
+};
static void __metadata_dst_init(struct metadata_dst *md_dst,
enum metadata_type type, u8 optslen)
-
{
struct dst_entry *dst;
dst = &md_dst->dst;
- dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
+ dst_init(dst, &dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE,
DST_METADATA | DST_NOCOUNT);
-
- dst->input = dst_md_discard;
- dst->output = dst_md_discard_out;
-
memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
md_dst->type = type;
}
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index be74ab4551c2..0ccfd5fa5cb9 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -162,3 +162,22 @@ void dst_cache_destroy(struct dst_cache *dst_cache)
free_percpu(dst_cache->cache);
}
EXPORT_SYMBOL_GPL(dst_cache_destroy);
+
+void dst_cache_reset_now(struct dst_cache *dst_cache)
+{
+ int i;
+
+ if (!dst_cache->cache)
+ return;
+
+ dst_cache->reset_ts = jiffies;
+ for_each_possible_cpu(i) {
+ struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i);
+ struct dst_entry *dst = idst->dst;
+
+ idst->cookie = 0;
+ idst->dst = NULL;
+ dst_release(dst);
+ }
+}
+EXPORT_SYMBOL_GPL(dst_cache_reset_now);
diff --git a/net/core/failover.c b/net/core/failover.c
index b5cd3c727285..864d2d83eff4 100644
--- a/net/core/failover.c
+++ b/net/core/failover.c
@@ -252,7 +252,7 @@ struct failover *failover_register(struct net_device *dev,
return ERR_PTR(-ENOMEM);
rcu_assign_pointer(failover->ops, ops);
- dev_hold(dev);
+ netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL);
dev->priv_flags |= IFF_FAILOVER;
rcu_assign_pointer(failover->failover_dev, dev);
@@ -285,7 +285,7 @@ void failover_unregister(struct failover *failover)
failover_dev->name);
failover_dev->priv_flags &= ~IFF_FAILOVER;
- dev_put(failover_dev);
+ netdev_put(failover_dev, &failover->dev_tracker);
spin_lock(&failover_lock);
list_del(&failover->list);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index bd7eba9066f8..75282222e0b4 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -14,6 +14,20 @@
#include <net/sock.h>
#include <net/fib_rules.h>
#include <net/ip_tunnels.h>
+#include <linux/indirect_call_wrapper.h>
+
+#if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES)
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+#define INDIRECT_CALL_MT(f, f2, f1, ...) \
+ INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__)
+#else
+#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__)
+#endif
+#elif defined(CONFIG_IP_MULTIPLE_TABLES)
+#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
+#else
+#define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__)
+#endif
static const struct fib_kuid_range fib_kuid_range_unset = {
KUIDT_INIT(0),
@@ -43,7 +57,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
{
struct fib_rule *r;
- r = kzalloc(ops->rule_size, GFP_KERNEL);
+ r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);
if (r == NULL)
return -ENOMEM;
@@ -267,7 +281,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
uid_gt(fl->flowi_uid, rule->uid_range.end))
goto out;
- ret = ops->match(rule, fl, flags);
+ ret = INDIRECT_CALL_MT(ops->match,
+ fib6_rule_match,
+ fib4_rule_match,
+ rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
}
@@ -298,9 +315,15 @@ jumped:
} else if (rule->action == FR_ACT_NOP)
continue;
else
- err = ops->action(rule, fl, flags, arg);
-
- if (!err && ops->suppress && ops->suppress(rule, arg))
+ err = INDIRECT_CALL_MT(ops->action,
+ fib6_rule_action,
+ fib4_rule_action,
+ rule, fl, flags, arg);
+
+ if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress,
+ fib6_rule_suppress,
+ fib4_rule_suppress,
+ rule, flags, arg))
continue;
if (err != -EAGAIN) {
@@ -518,7 +541,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- nlrule = kzalloc(ops->rule_size, GFP_KERNEL);
+ nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);
if (!nlrule) {
err = -ENOMEM;
goto errout;
@@ -540,7 +563,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net_device *dev;
nlrule->iifindex = -1;
- nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+ nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
dev = __dev_get_by_name(net, nlrule->iifname);
if (dev)
nlrule->iifindex = dev->ifindex;
@@ -550,7 +573,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net_device *dev;
nlrule->oifindex = -1;
- nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+ nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
dev = __dev_get_by_name(net, nlrule->oifname);
if (dev)
nlrule->oifindex = dev->ifindex;
@@ -727,6 +750,27 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
return 0;
}
+static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = {
+ [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 },
+ [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+ [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+ [FRA_PRIORITY] = { .type = NLA_U32 },
+ [FRA_FWMARK] = { .type = NLA_U32 },
+ [FRA_FLOW] = { .type = NLA_U32 },
+ [FRA_TUN_ID] = { .type = NLA_U64 },
+ [FRA_FWMASK] = { .type = NLA_U32 },
+ [FRA_TABLE] = { .type = NLA_U32 },
+ [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 },
+ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 },
+ [FRA_GOTO] = { .type = NLA_U32 },
+ [FRA_L3MDEV] = { .type = NLA_U8 },
+ [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) },
+ [FRA_PROTOCOL] = { .type = NLA_U8 },
+ [FRA_IP_PROTO] = { .type = NLA_U8 },
+ [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
+ [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }
+};
+
int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -751,7 +795,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
}
err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX,
- ops->policy, extack);
+ fib_rule_policy, extack);
if (err < 0) {
NL_SET_ERR_MSG(extack, "Error parsing msg");
goto errout;
@@ -859,7 +903,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
}
err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX,
- ops->policy, extack);
+ fib_rule_policy, extack);
if (err < 0) {
NL_SET_ERR_MSG(extack, "Error parsing msg");
goto errout;
@@ -1145,7 +1189,7 @@ static void notify_rule_change(int event, struct fib_rule *rule,
{
struct net *net;
struct sk_buff *skb;
- int err = -ENOBUFS;
+ int err = -ENOMEM;
net = ops->fro_net;
skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL);
diff --git a/net/core/filter.c b/net/core/filter.c
index c180871e606d..bb0136e7a8e4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -17,6 +17,8 @@
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
+#include <linux/atomic.h>
+#include <linux/bpf_verifier.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
@@ -41,12 +43,12 @@
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/unaligned.h>
-#include <asm/cmpxchg.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
@@ -73,6 +75,37 @@
#include <net/lwtunnel.h>
#include <net/ipv6_stubs.h>
#include <net/bpf_sk_storage.h>
+#include <net/transp_v6.h>
+#include <linux/btf_ids.h>
+#include <net/tls.h>
+#include <net/xdp.h>
+#include <net/mptcp.h>
+
+static const struct bpf_func_proto *
+bpf_sk_base_func_proto(enum bpf_func_id func_id);
+
+int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
+{
+ if (in_compat_syscall()) {
+ struct compat_sock_fprog f32;
+
+ if (len != sizeof(f32))
+ return -EINVAL;
+ if (copy_from_sockptr(&f32, src, sizeof(f32)))
+ return -EFAULT;
+ memset(dst, 0, sizeof(*dst));
+ dst->len = f32.len;
+ dst->filter = compat_ptr(f32.filter);
+ } else {
+ if (len != sizeof(*dst))
+ return -EINVAL;
+ if (copy_from_sockptr(dst, src, sizeof(*dst)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
/**
* sk_filter_trim_cap - run a packet through a socket filter
@@ -83,7 +116,7 @@
* Run the eBPF program and then cut skb->data to correct size returned by
* the program. If pkt_len is 0 we toss packet. If skb->len is smaller
* than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
+ * wrapper to bpf_prog_run. It returns 0 if the packet should
* be accepted or -EPERM if the packet should be tossed.
*
*/
@@ -205,7 +238,7 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- u16 tmp, *ptr;
+ __be16 tmp, *ptr;
const int len = sizeof(tmp);
if (offset >= 0) {
@@ -232,7 +265,7 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- u32 tmp, *ptr;
+ __be32 tmp, *ptr;
const int len = sizeof(tmp);
if (likely(offset >= 0)) {
@@ -256,17 +289,6 @@ BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
offset);
}
-BPF_CALL_0(bpf_get_raw_cpu_id)
-{
- return raw_smp_processor_id();
-}
-
-static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
- .func = bpf_get_raw_cpu_id,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
-};
-
static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
struct bpf_insn *insn_buf)
{
@@ -281,7 +303,7 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
break;
case SKF_AD_PKTTYPE:
- *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
+ *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET);
*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
@@ -303,7 +325,7 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
offsetof(struct sk_buff, vlan_tci));
break;
case SKF_AD_VLAN_TAG_PRESENT:
- *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET());
+ *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET);
if (PKT_VLAN_PRESENT_BIT)
*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT);
if (PKT_VLAN_PRESENT_BIT < 7)
@@ -1193,10 +1215,11 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
u32 filter_size = bpf_prog_size(fp->prog->len);
+ int optmem_max = READ_ONCE(sysctl_optmem_max);
/* same check as in sock_kmalloc() */
- if (filter_size <= sysctl_optmem_max &&
- atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
+ if (filter_size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) {
atomic_add(filter_size, &sk->sk_omem_alloc);
return true;
}
@@ -1222,10 +1245,9 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
int err, new_len, old_len = fp->len;
bool seen_ld_abs = false;
- /* We are free to overwrite insns et al right here as it
- * won't be used at this point in time anymore internally
- * after the migration to the internal BPF instruction
- * representation.
+ /* We are free to overwrite insns et al right here as it won't be used at
+ * this point in time anymore internally after the migration to the eBPF
+ * instruction representation.
*/
BUILD_BUG_ON(sizeof(struct sock_filter) !=
sizeof(struct bpf_insn));
@@ -1316,8 +1338,8 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
*/
bpf_jit_compile(fp);
- /* JIT compiler couldn't process this filter, so do the
- * internal BPF translation for the optimized interpreter.
+ /* JIT compiler couldn't process this filter, so do the eBPF translation
+ * for the optimized interpreter.
*/
if (!fp->jited)
fp = bpf_migrate_filter(fp);
@@ -1528,7 +1550,7 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
if (IS_ERR(prog))
return PTR_ERR(prog);
- if (bpf_prog_size(prog->len) > sysctl_optmem_max)
+ if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max))
err = -ENOMEM;
else
err = reuseport_attach_prog(sk, prog);
@@ -1595,7 +1617,7 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
}
} else {
/* BPF_PROG_TYPE_SOCKET_FILTER */
- if (bpf_prog_size(prog->len) > sysctl_optmem_max) {
+ if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) {
err = -ENOMEM;
goto err_prog_put;
}
@@ -1668,7 +1690,7 @@ BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
return -EINVAL;
- if (unlikely(offset > 0xffff))
+ if (unlikely(offset > INT_MAX))
return -EFAULT;
if (unlikely(bpf_try_make_writable(skb, offset + len)))
return -EFAULT;
@@ -1693,7 +1715,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE,
.arg5_type = ARG_ANYTHING,
};
@@ -1703,7 +1725,7 @@ BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
{
void *ptr;
- if (unlikely(offset > 0xffff))
+ if (unlikely(offset > INT_MAX))
goto err_clear;
ptr = skb_header_pointer(skb, offset, len, to);
@@ -1766,25 +1788,27 @@ BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
u32, offset, void *, to, u32, len, u32, start_header)
{
u8 *end = skb_tail_pointer(skb);
- u8 *net = skb_network_header(skb);
- u8 *mac = skb_mac_header(skb);
- u8 *ptr;
+ u8 *start, *ptr;
- if (unlikely(offset > 0xffff || len > (end - mac)))
+ if (unlikely(offset > 0xffff))
goto err_clear;
switch (start_header) {
case BPF_HDR_START_MAC:
- ptr = mac + offset;
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ goto err_clear;
+ start = skb_mac_header(skb);
break;
case BPF_HDR_START_NET:
- ptr = net + offset;
+ start = skb_network_header(skb);
break;
default:
goto err_clear;
}
- if (likely(ptr >= mac && ptr + len <= end)) {
+ ptr = start + offset;
+
+ if (likely(ptr + len <= end)) {
memcpy(to, ptr, len);
return 0;
}
@@ -1842,10 +1866,7 @@ static const struct bpf_func_proto bpf_sk_fullsock_proto = {
static inline int sk_skb_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
- int err = __bpf_try_make_writable(skb, write_len);
-
- bpf_compute_data_end_sk_skb(skb);
- return err;
+ return __bpf_try_make_writable(skb, write_len);
}
BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -1999,9 +2020,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
.gpl_only = false,
.pkt_access = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
- .arg3_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE_OR_ZERO,
.arg5_type = ARG_ANYTHING,
};
@@ -2026,15 +2047,49 @@ static const struct bpf_func_proto bpf_csum_update_proto = {
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
+{
+ /* The interface is to be used in combination with bpf_skb_adjust_room()
+ * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET
+ * is passed as flags, for example.
+ */
+ switch (level) {
+ case BPF_CSUM_LEVEL_INC:
+ __skb_incr_checksum_unnecessary(skb);
+ break;
+ case BPF_CSUM_LEVEL_DEC:
+ __skb_decr_checksum_unnecessary(skb);
+ break;
+ case BPF_CSUM_LEVEL_RESET:
+ __skb_reset_checksum_unnecessary(skb);
+ break;
+ case BPF_CSUM_LEVEL_QUERY:
+ return skb->ip_summed == CHECKSUM_UNNECESSARY ?
+ skb->csum_level : -EACCES;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_csum_level_proto = {
+ .func = bpf_csum_level,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
- return dev_forward_skb(dev, skb);
+ return dev_forward_skb_nomtu(dev, skb);
}
static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
struct sk_buff *skb)
{
- int ret = ____dev_forward_skb(dev, skb);
+ int ret = ____dev_forward_skb(dev, skb, false);
if (likely(!ret)) {
skb->dev = dev;
@@ -2055,7 +2110,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
}
skb->dev = dev;
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
dev_xmit_recursion_inc();
ret = dev_queue_xmit(skb);
@@ -2109,13 +2164,249 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
return __bpf_redirect_no_mac(skb, dev, flags);
}
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
+ struct net_device *dev, struct bpf_nh_params *nh)
+{
+ u32 hh_len = LL_RESERVED_SPACE(dev);
+ const struct in6_addr *nexthop;
+ struct dst_entry *dst = NULL;
+ struct neighbour *neigh;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+ goto out_drop;
+ }
+
+ skb->dev = dev;
+ skb_clear_tstamp(skb);
+
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
+ return -ENOMEM;
+ }
+
+ rcu_read_lock_bh();
+ if (!nh) {
+ dst = skb_dst(skb);
+ nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
+ &ipv6_hdr(skb)->daddr);
+ } else {
+ nexthop = &nh->ipv6_nh;
+ }
+ neigh = ip_neigh_gw6(dev, nexthop);
+ if (likely(!IS_ERR(neigh))) {
+ int ret;
+
+ sock_confirm_neigh(skb, neigh);
+ dev_xmit_recursion_inc();
+ ret = neigh_output(neigh, skb, false);
+ dev_xmit_recursion_dec();
+ rcu_read_unlock_bh();
+ return ret;
+ }
+ rcu_read_unlock_bh();
+ if (dst)
+ IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+out_drop:
+ kfree_skb(skb);
+ return -ENETDOWN;
+}
+
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct net *net = dev_net(dev);
+ int err, ret = NET_XMIT_DROP;
+
+ if (!nh) {
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {
+ .flowi6_flags = FLOWI_FLAG_ANYSRC,
+ .flowi6_mark = skb->mark,
+ .flowlabel = ip6_flowinfo(ip6h),
+ .flowi6_oif = dev->ifindex,
+ .flowi6_proto = ip6h->nexthdr,
+ .daddr = ip6h->daddr,
+ .saddr = ip6h->saddr,
+ };
+
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
+ if (IS_ERR(dst))
+ goto out_drop;
+
+ skb_dst_set(skb, dst);
+ } else if (nh->nh_family != AF_INET6) {
+ goto out_drop;
+ }
+
+ err = bpf_out_neigh_v6(net, skb, dev, nh);
+ if (unlikely(net_xmit_eval(err)))
+ dev->stats.tx_errors++;
+ else
+ ret = NET_XMIT_SUCCESS;
+ goto out_xmit;
+out_drop:
+ dev->stats.tx_errors++;
+ kfree_skb(skb);
+out_xmit:
+ return ret;
+}
+#else
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+#endif /* CONFIG_IPV6 */
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
+ struct net_device *dev, struct bpf_nh_params *nh)
+{
+ u32 hh_len = LL_RESERVED_SPACE(dev);
+ struct neighbour *neigh;
+ bool is_v6gw = false;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+ goto out_drop;
+ }
+
+ skb->dev = dev;
+ skb_clear_tstamp(skb);
+
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
+ return -ENOMEM;
+ }
+
+ rcu_read_lock_bh();
+ if (!nh) {
+ struct dst_entry *dst = skb_dst(skb);
+ struct rtable *rt = container_of(dst, struct rtable, dst);
+
+ neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
+ } else if (nh->nh_family == AF_INET6) {
+ neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
+ is_v6gw = true;
+ } else if (nh->nh_family == AF_INET) {
+ neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
+ } else {
+ rcu_read_unlock_bh();
+ goto out_drop;
+ }
+
+ if (likely(!IS_ERR(neigh))) {
+ int ret;
+
+ sock_confirm_neigh(skb, neigh);
+ dev_xmit_recursion_inc();
+ ret = neigh_output(neigh, skb, is_v6gw);
+ dev_xmit_recursion_dec();
+ rcu_read_unlock_bh();
+ return ret;
+ }
+ rcu_read_unlock_bh();
+out_drop:
+ kfree_skb(skb);
+ return -ENETDOWN;
+}
+
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ const struct iphdr *ip4h = ip_hdr(skb);
+ struct net *net = dev_net(dev);
+ int err, ret = NET_XMIT_DROP;
+
+ if (!nh) {
+ struct flowi4 fl4 = {
+ .flowi4_flags = FLOWI_FLAG_ANYSRC,
+ .flowi4_mark = skb->mark,
+ .flowi4_tos = RT_TOS(ip4h->tos),
+ .flowi4_oif = dev->ifindex,
+ .flowi4_proto = ip4h->protocol,
+ .daddr = ip4h->daddr,
+ .saddr = ip4h->saddr,
+ };
+ struct rtable *rt;
+
+ rt = ip_route_output_flow(net, &fl4, NULL);
+ if (IS_ERR(rt))
+ goto out_drop;
+ if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
+ ip_rt_put(rt);
+ goto out_drop;
+ }
+
+ skb_dst_set(skb, &rt->dst);
+ }
+
+ err = bpf_out_neigh_v4(net, skb, dev, nh);
+ if (unlikely(net_xmit_eval(err)))
+ dev->stats.tx_errors++;
+ else
+ ret = NET_XMIT_SUCCESS;
+ goto out_xmit;
+out_drop:
+ dev->stats.tx_errors++;
+ kfree_skb(skb);
+out_xmit:
+ return ret;
+}
+#else
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+#endif /* CONFIG_INET */
+
+static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ struct ethhdr *ethh = eth_hdr(skb);
+
+ if (unlikely(skb->mac_header >= skb->network_header))
+ goto out;
+ bpf_push_mac_rcsum(skb);
+ if (is_multicast_ether_addr(ethh->h_dest))
+ goto out;
+
+ skb_pull(skb, sizeof(*ethh));
+ skb_unset_mac_header(skb);
+ skb_reset_network_header(skb);
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return __bpf_redirect_neigh_v4(skb, dev, nh);
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ return __bpf_redirect_neigh_v6(skb, dev, nh);
+out:
+ kfree_skb(skb);
+ return -ENOTSUPP;
+}
+
+/* Internal, non-exposed redirect flags. */
+enum {
+ BPF_F_NEIGH = (1ULL << 1),
+ BPF_F_PEER = (1ULL << 2),
+ BPF_F_NEXTHOP = (1ULL << 3),
+#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
+};
+
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
struct net_device *dev;
struct sk_buff *clone;
int ret;
- if (unlikely(flags & ~(BPF_F_INGRESS)))
+ if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return -EINVAL;
dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
@@ -2152,11 +2443,46 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
+int skb_do_redirect(struct sk_buff *skb)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct net *net = dev_net(skb->dev);
+ struct net_device *dev;
+ u32 flags = ri->flags;
+
+ dev = dev_get_by_index_rcu(net, ri->tgt_index);
+ ri->tgt_index = 0;
+ ri->flags = 0;
+ if (unlikely(!dev))
+ goto out_drop;
+ if (flags & BPF_F_PEER) {
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (unlikely(!ops->ndo_get_peer_dev ||
+ !skb_at_tc_ingress(skb)))
+ goto out_drop;
+ dev = ops->ndo_get_peer_dev(dev);
+ if (unlikely(!dev ||
+ !(dev->flags & IFF_UP) ||
+ net_eq(net, dev_net(dev))))
+ goto out_drop;
+ skb->dev = dev;
+ return -EAGAIN;
+ }
+ return flags & BPF_F_NEIGH ?
+ __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ?
+ &ri->nh : NULL) :
+ __bpf_redirect(skb, dev, flags);
+out_drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- if (unlikely(flags & ~(BPF_F_INGRESS)))
+ if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return TC_ACT_SHOT;
ri->flags = flags;
@@ -2165,29 +2491,63 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
return TC_ACT_REDIRECT;
}
-int skb_do_redirect(struct sk_buff *skb)
+static const struct bpf_func_proto bpf_redirect_proto = {
+ .func = bpf_redirect,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct net_device *dev;
- dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
- ri->tgt_index = 0;
- if (unlikely(!dev)) {
- kfree_skb(skb);
- return -EINVAL;
- }
+ if (unlikely(flags))
+ return TC_ACT_SHOT;
- return __bpf_redirect(skb, dev, ri->flags);
+ ri->flags = BPF_F_PEER;
+ ri->tgt_index = ifindex;
+
+ return TC_ACT_REDIRECT;
}
-static const struct bpf_func_proto bpf_redirect_proto = {
- .func = bpf_redirect,
+static const struct bpf_func_proto bpf_redirect_peer_proto = {
+ .func = bpf_redirect_peer,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
+ int, plen, u64, flags)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+ if (unlikely((plen && plen < sizeof(*params)) || flags))
+ return TC_ACT_SHOT;
+
+ ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
+ ri->tgt_index = ifindex;
+
+ BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
+ if (plen)
+ memcpy(&ri->nh, params, sizeof(ri->nh));
+
+ return TC_ACT_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_redirect_neigh_proto = {
+ .func = bpf_redirect_neigh,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
{
msg->apply_bytes = bytes;
@@ -2246,7 +2606,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
* account for the headroom.
*/
bytes_sg_total = start - offset + bytes;
- if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len)
+ if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
goto out;
/* At this point we need to linearize multiple scatterlist
@@ -2353,6 +2713,9 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
if (unlikely(flags))
return -EINVAL;
+ if (unlikely(len == 0))
+ return 0;
+
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
@@ -2452,7 +2815,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* Place newly allocated data buffer */
sk_mem_charge(msg->sk, len);
msg->sg.size += len;
- __clear_bit(new, &msg->sg.copy);
+ __clear_bit(new, msg->sg.copy);
sg_set_page(&msg->sg.data[new], page, len + copy, 0);
if (rsge.length) {
get_page(sg_page(&rsge));
@@ -2590,8 +2953,8 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
}
pop = 0;
} else if (pop >= sge->length - a) {
- sge->length = a;
pop -= (sge->length - a);
+ sge->length = a;
}
}
@@ -2642,6 +3005,36 @@ static const struct bpf_func_proto bpf_msg_pop_data_proto = {
.arg4_type = ARG_ANYTHING,
};
+#ifdef CONFIG_CGROUP_NET_CLASSID
+BPF_CALL_0(bpf_get_cgroup_classid_curr)
+{
+ return __task_get_classid(current);
+}
+
+const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
+ .func = bpf_get_cgroup_classid_curr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb)
+{
+ struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ return sock_cgroup_classid(&sk->sk_cgrp_data);
+}
+
+static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = {
+ .func = bpf_skb_cgroup_classid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+#endif
+
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
return task_get_classid(skb);
@@ -2837,9 +3230,6 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
u32 off = skb_mac_header_len(skb);
int ret;
- if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
- return -ENOTSUPP;
-
ret = skb_cow(skb, len_diff);
if (unlikely(ret < 0))
return ret;
@@ -2851,19 +3241,11 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
- /* SKB_GSO_TCPV4 needs to be changed into
- * SKB_GSO_TCPV6.
- */
+ /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
if (shinfo->gso_type & SKB_GSO_TCPV4) {
shinfo->gso_type &= ~SKB_GSO_TCPV4;
shinfo->gso_type |= SKB_GSO_TCPV6;
}
-
- /* Due to IPv6 header, MSS needs to be downgraded. */
- skb_decrease_gso_size(shinfo, len_diff);
- /* Header must be checked, and gso_segs recomputed. */
- shinfo->gso_type |= SKB_GSO_DODGY;
- shinfo->gso_segs = 0;
}
skb->protocol = htons(ETH_P_IPV6);
@@ -2878,9 +3260,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
u32 off = skb_mac_header_len(skb);
int ret;
- if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
- return -ENOTSUPP;
-
ret = skb_unclone(skb, GFP_ATOMIC);
if (unlikely(ret < 0))
return ret;
@@ -2892,19 +3271,11 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
- /* SKB_GSO_TCPV6 needs to be changed into
- * SKB_GSO_TCPV4.
- */
+ /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
if (shinfo->gso_type & SKB_GSO_TCPV6) {
shinfo->gso_type &= ~SKB_GSO_TCPV6;
shinfo->gso_type |= SKB_GSO_TCPV4;
}
-
- /* Due to IPv4 header, MSS can be upgraded. */
- skb_increase_gso_size(shinfo, len_diff);
- /* Header must be checked, and gso_segs recomputed. */
- shinfo->gso_type |= SKB_GSO_DODGY;
- shinfo->gso_segs = 0;
}
skb->protocol = htons(ETH_P_IP);
@@ -3005,6 +3376,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
+ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
BPF_F_ADJ_ROOM_ENCAP_L2( \
BPF_ADJ_ROOM_ENCAP_L2_MASK))
@@ -3041,6 +3413,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
return -EINVAL;
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
+ inner_mac_len < ETH_HLEN)
+ return -EINVAL;
+
if (skb->encapsulation)
return -EALREADY;
@@ -3059,7 +3435,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
skb->inner_mac_header = inner_net - inner_mac_len;
skb->inner_network_header = inner_net;
skb->inner_transport_header = inner_trans;
- skb_set_inner_protocol(skb, skb->protocol);
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
+ skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+ else
+ skb_set_inner_protocol(skb, skb->protocol);
skb->encapsulation = 1;
skb_set_network_header(skb, mac_len);
@@ -3111,7 +3491,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
{
int ret;
- if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO)
+ if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
return -EINVAL;
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
@@ -3144,24 +3525,62 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
return 0;
}
-static u32 __bpf_skb_max_len(const struct sk_buff *skb)
+#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC
+
+BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
+ u32, mode, u64, flags)
{
- return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
- SKB_MAX_ALLOC;
+ u32 len_diff_abs = abs(len_diff);
+ bool shrink = len_diff < 0;
+ int ret = 0;
+
+ if (unlikely(flags || mode))
+ return -EINVAL;
+ if (unlikely(len_diff_abs > 0xfffU))
+ return -EFAULT;
+
+ if (!shrink) {
+ ret = skb_cow(skb, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+ __skb_push(skb, len_diff_abs);
+ memset(skb->data, 0, len_diff_abs);
+ } else {
+ if (unlikely(!pskb_may_pull(skb, len_diff_abs)))
+ return -ENOMEM;
+ __skb_pull(skb, len_diff_abs);
+ }
+ if (tls_sw_has_ctx_rx(skb->sk)) {
+ struct strp_msg *rxm = strp_msg(skb);
+
+ rxm->full_len += len_diff;
+ }
+ return ret;
}
+static const struct bpf_func_proto sk_skb_adjust_room_proto = {
+ .func = sk_skb_adjust_room,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32, mode, u64, flags)
{
u32 len_cur, len_diff_abs = abs(len_diff);
u32 len_min = bpf_skb_net_base_len(skb);
- u32 len_max = __bpf_skb_max_len(skb);
+ u32 len_max = BPF_SKB_MAX_LEN;
__be16 proto = skb->protocol;
bool shrink = len_diff < 0;
u32 off;
int ret;
- if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK))
+ if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK |
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
return -EINVAL;
if (unlikely(len_diff_abs > 0xfffU))
return -EFAULT;
@@ -3189,6 +3608,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
bpf_skb_net_grow(skb, off, len_diff_abs, flags);
+ if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET))
+ __skb_reset_checksum_unnecessary(skb);
bpf_compute_data_pointers(skb);
return ret;
@@ -3235,7 +3656,7 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
u64 flags)
{
- u32 max_len = __bpf_skb_max_len(skb);
+ u32 max_len = BPF_SKB_MAX_LEN;
u32 min_len = __bpf_skb_min_len(skb);
int ret;
@@ -3293,10 +3714,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
u64, flags)
{
- int ret = __bpf_skb_change_tail(skb, new_len, flags);
-
- bpf_compute_data_end_sk_skb(skb);
- return ret;
+ return __bpf_skb_change_tail(skb, new_len, flags);
}
static const struct bpf_func_proto sk_skb_change_tail_proto = {
@@ -3311,7 +3729,7 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = {
static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
u64 flags)
{
- u32 max_len = __bpf_skb_max_len(skb);
+ u32 max_len = BPF_SKB_MAX_LEN;
u32 new_len = skb->len + head_room;
int ret;
@@ -3333,6 +3751,7 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
__skb_push(skb, head_room);
memset(skb->data, 0, head_room);
skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
}
return ret;
@@ -3359,10 +3778,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
u64, flags)
{
- int ret = __bpf_skb_change_head(skb, head_room, flags);
-
- bpf_compute_data_end_sk_skb(skb);
- return ret;
+ return __bpf_skb_change_head(skb, head_room, flags);
}
static const struct bpf_func_proto sk_skb_change_head_proto = {
@@ -3373,6 +3789,28 @@ static const struct bpf_func_proto sk_skb_change_head_proto = {
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
};
+
+BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
+{
+ return xdp_get_buff_len(xdp);
+}
+
+static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = {
+ .func = bpf_xdp_get_buff_len,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff)
+
+const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = {
+ .func = bpf_xdp_get_buff_len,
+ .gpl_only = false,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0],
+};
+
static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
{
return xdp_data_meta_unsupported(xdp) ? 0 :
@@ -3407,17 +3845,225 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
.arg2_type = ARG_ANYTHING,
};
+static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+ void *buf, unsigned long len, bool flush)
+{
+ unsigned long ptr_len, ptr_off = 0;
+ skb_frag_t *next_frag, *end_frag;
+ struct skb_shared_info *sinfo;
+ void *src, *dst;
+ u8 *ptr_buf;
+
+ if (likely(xdp->data_end - xdp->data >= off + len)) {
+ src = flush ? buf : xdp->data + off;
+ dst = flush ? xdp->data + off : buf;
+ memcpy(dst, src, len);
+ return;
+ }
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ end_frag = &sinfo->frags[sinfo->nr_frags];
+ next_frag = &sinfo->frags[0];
+
+ ptr_len = xdp->data_end - xdp->data;
+ ptr_buf = xdp->data;
+
+ while (true) {
+ if (off < ptr_off + ptr_len) {
+ unsigned long copy_off = off - ptr_off;
+ unsigned long copy_len = min(len, ptr_len - copy_off);
+
+ src = flush ? buf : ptr_buf + copy_off;
+ dst = flush ? ptr_buf + copy_off : buf;
+ memcpy(dst, src, copy_len);
+
+ off += copy_len;
+ len -= copy_len;
+ buf += copy_len;
+ }
+
+ if (!len || next_frag == end_frag)
+ break;
+
+ ptr_off += ptr_len;
+ ptr_buf = skb_frag_address(next_frag);
+ ptr_len = skb_frag_size(next_frag);
+ next_frag++;
+ }
+}
+
+static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ u32 size = xdp->data_end - xdp->data;
+ void *addr = xdp->data;
+ int i;
+
+ if (unlikely(offset > 0xffff || len > 0xffff))
+ return ERR_PTR(-EFAULT);
+
+ if (offset + len > xdp_get_buff_len(xdp))
+ return ERR_PTR(-EINVAL);
+
+ if (offset < size) /* linear area */
+ goto out;
+
+ offset -= size;
+ for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */
+ u32 frag_size = skb_frag_size(&sinfo->frags[i]);
+
+ if (offset < frag_size) {
+ addr = skb_frag_address(&sinfo->frags[i]);
+ size = frag_size;
+ break;
+ }
+ offset -= frag_size;
+ }
+out:
+ return offset + len <= size ? addr + offset : NULL;
+}
+
+BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset,
+ void *, buf, u32, len)
+{
+ void *ptr;
+
+ ptr = bpf_xdp_pointer(xdp, offset, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ if (!ptr)
+ bpf_xdp_copy_buf(xdp, offset, buf, len, false);
+ else
+ memcpy(buf, ptr, len);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
+ .func = bpf_xdp_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
+ void *, buf, u32, len)
+{
+ void *ptr;
+
+ ptr = bpf_xdp_pointer(xdp, offset, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ if (!ptr)
+ bpf_xdp_copy_buf(xdp, offset, buf, len, true);
+ else
+ memcpy(ptr, buf, len);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
+ .func = bpf_xdp_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
+ struct xdp_rxq_info *rxq = xdp->rxq;
+ unsigned int tailroom;
+
+ if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz)
+ return -EOPNOTSUPP;
+
+ tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag);
+ if (unlikely(offset > tailroom))
+ return -EINVAL;
+
+ memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset);
+ skb_frag_size_add(frag, offset);
+ sinfo->xdp_frags_size += offset;
+
+ return 0;
+}
+
+static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ int i, n_frags_free = 0, len_free = 0;
+
+ if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN))
+ return -EINVAL;
+
+ for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) {
+ skb_frag_t *frag = &sinfo->frags[i];
+ int shrink = min_t(int, offset, skb_frag_size(frag));
+
+ len_free += shrink;
+ offset -= shrink;
+
+ if (skb_frag_size(frag) == shrink) {
+ struct page *page = skb_frag_page(frag);
+
+ __xdp_return(page_address(page), &xdp->rxq->mem,
+ false, NULL);
+ n_frags_free++;
+ } else {
+ skb_frag_size_sub(frag, shrink);
+ break;
+ }
+ }
+ sinfo->nr_frags -= n_frags_free;
+ sinfo->xdp_frags_size -= len_free;
+
+ if (unlikely(!sinfo->nr_frags)) {
+ xdp_buff_clear_frags_flag(xdp);
+ xdp->data_end -= offset;
+ }
+
+ return 0;
+}
+
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
+ void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
void *data_end = xdp->data_end + offset;
- /* only shrinking is allowed for now. */
- if (unlikely(offset >= 0))
+ if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */
+ if (offset < 0)
+ return bpf_xdp_frags_shrink_tail(xdp, -offset);
+
+ return bpf_xdp_frags_increase_tail(xdp, offset);
+ }
+
+ /* Notice that xdp_data_hard_end have reserved some tailroom */
+ if (unlikely(data_end > data_hard_end))
+ return -EINVAL;
+
+ /* ALL drivers MUST init xdp->frame_sz, chicken check below */
+ if (unlikely(xdp->frame_sz > PAGE_SIZE)) {
+ WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz);
return -EINVAL;
+ }
if (unlikely(data_end < xdp->data + ETH_HLEN))
return -EINVAL;
+ /* Clear memory area on grow, can contain uninit kernel memory */
+ if (offset > 0)
+ memset(xdp->data_end, 0, offset);
+
xdp->data_end = data_end;
return 0;
@@ -3442,8 +4088,7 @@ BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
if (unlikely(meta < xdp_frame_end ||
meta > xdp->data))
return -EINVAL;
- if (unlikely((metalen & (sizeof(__u32) - 1)) ||
- (metalen > 32)))
+ if (unlikely(xdp_metalen_invalid(metalen)))
return -EACCES;
xdp->data_meta = meta;
@@ -3459,23 +4104,34 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
.arg2_type = ARG_ANYTHING,
};
-static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
- struct bpf_map *map, struct xdp_buff *xdp)
-{
- switch (map->map_type) {
- case BPF_MAP_TYPE_DEVMAP:
- case BPF_MAP_TYPE_DEVMAP_HASH:
- return dev_map_enqueue(fwd, xdp, dev_rx);
- case BPF_MAP_TYPE_CPUMAP:
- return cpu_map_enqueue(fwd, xdp, dev_rx);
- case BPF_MAP_TYPE_XSKMAP:
- return __xsk_map_redirect(fwd, xdp);
- default:
- return -EBADRQC;
- }
- return 0;
-}
-
+/* XDP_REDIRECT works by a three-step process, implemented in the functions
+ * below:
+ *
+ * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
+ * of the redirect and store it (along with some other metadata) in a per-CPU
+ * struct bpf_redirect_info.
+ *
+ * 2. When the program returns the XDP_REDIRECT return code, the driver will
+ * call xdp_do_redirect() which will use the information in struct
+ * bpf_redirect_info to actually enqueue the frame into a map type-specific
+ * bulk queue structure.
+ *
+ * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(),
+ * which will flush all the different bulk queues, thus completing the
+ * redirect.
+ *
+ * Pointers to the map entries will be kept around for this whole sequence of
+ * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
+ * the core code; instead, the RCU protection relies on everything happening
+ * inside a single NAPI poll sequence, which means it's between a pair of calls
+ * to local_bh_disable()/local_bh_enable().
+ *
+ * The map entries are marked as __rcu and the map code makes sure to
+ * dereference those pointers with rcu_dereference_check() in a way that works
+ * for both sections that to hold an rcu_read_lock() and sections that are
+ * called from NAPI without a separate rcu_read_lock(). The code below does not
+ * use RCU annotations, but relies on those in the map code.
+ */
void xdp_do_flush(void)
{
__dev_flush();
@@ -3484,22 +4140,6 @@ void xdp_do_flush(void)
}
EXPORT_SYMBOL_GPL(xdp_do_flush);
-static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
-{
- switch (map->map_type) {
- case BPF_MAP_TYPE_DEVMAP:
- return __dev_map_lookup_elem(map, index);
- case BPF_MAP_TYPE_DEVMAP_HASH:
- return __dev_map_hash_lookup_elem(map, index);
- case BPF_MAP_TYPE_CPUMAP:
- return __cpu_map_lookup_elem(map, index);
- case BPF_MAP_TYPE_XSKMAP:
- return __xsk_map_lookup_elem(map, index);
- default:
- return NULL;
- }
-}
-
void bpf_clear_redirect_map(struct bpf_map *map)
{
struct bpf_redirect_info *ri;
@@ -3517,81 +4157,196 @@ void bpf_clear_redirect_map(struct bpf_map *map)
}
}
-int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
+EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);
+
+u32 xdp_master_redirect(struct xdp_buff *xdp)
{
+ struct net_device *master, *slave;
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct bpf_map *map = READ_ONCE(ri->map);
- u32 index = ri->tgt_index;
+
+ master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
+ slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
+ if (slave && slave != xdp->rxq->dev) {
+ /* The target device is different from the receiving device, so
+ * redirect it to the new device.
+ * Using XDP_REDIRECT gets the correct behaviour from XDP enabled
+ * drivers to unmap the packet from their rx ring.
+ */
+ ri->tgt_index = slave->ifindex;
+ ri->map_id = INT_MAX;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
+ return XDP_REDIRECT;
+ }
+ return XDP_TX;
+}
+EXPORT_SYMBOL_GPL(xdp_master_redirect);
+
+static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri,
+ struct net_device *dev,
+ struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
+{
+ enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
int err;
- ri->tgt_index = 0;
- ri->tgt_value = NULL;
- WRITE_ONCE(ri->map, NULL);
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
- if (unlikely(!map)) {
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
+ err = __xsk_map_redirect(fwd, xdp);
+ if (unlikely(err))
+ goto err;
- err = dev_xdp_enqueue(fwd, xdp, dev);
- } else {
- err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
+ return 0;
+err:
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
+ return err;
+}
+
+static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
+ struct net_device *dev,
+ struct xdp_frame *xdpf,
+ struct bpf_prog *xdp_prog)
+{
+ enum bpf_map_type map_type = ri->map_type;
+ void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
+ struct bpf_map *map;
+ int err;
+
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
+
+ if (unlikely(!xdpf)) {
+ err = -EOVERFLOW;
+ goto err;
+ }
+
+ switch (map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ fallthrough;
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ map = READ_ONCE(ri->map);
+ if (unlikely(map)) {
+ WRITE_ONCE(ri->map, NULL);
+ err = dev_map_enqueue_multi(xdpf, dev, map,
+ ri->flags & BPF_F_EXCLUDE_INGRESS);
+ } else {
+ err = dev_map_enqueue(fwd, xdpf, dev);
+ }
+ break;
+ case BPF_MAP_TYPE_CPUMAP:
+ err = cpu_map_enqueue(fwd, xdpf, dev);
+ break;
+ case BPF_MAP_TYPE_UNSPEC:
+ if (map_id == INT_MAX) {
+ fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ break;
+ }
+ err = dev_xdp_enqueue(fwd, xdpf, dev);
+ break;
+ }
+ fallthrough;
+ default:
+ err = -EBADRQC;
}
if (unlikely(err))
goto err;
- _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
return 0;
err:
- _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
return err;
}
+
+int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ enum bpf_map_type map_type = ri->map_type;
+
+ /* XDP_REDIRECT is not fully supported yet for xdp frags since
+ * not all XDP capable drivers can map non-linear xdp_frame in
+ * ndo_xdp_xmit.
+ */
+ if (unlikely(xdp_buff_has_frags(xdp) &&
+ map_type != BPF_MAP_TYPE_CPUMAP))
+ return -EOPNOTSUPP;
+
+ if (map_type == BPF_MAP_TYPE_XSKMAP)
+ return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
+
+ return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp),
+ xdp_prog);
+}
EXPORT_SYMBOL_GPL(xdp_do_redirect);
+int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
+ struct xdp_frame *xdpf, struct bpf_prog *xdp_prog)
+{
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ enum bpf_map_type map_type = ri->map_type;
+
+ if (map_type == BPF_MAP_TYPE_XSKMAP)
+ return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
+
+ return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog);
+}
+EXPORT_SYMBOL_GPL(xdp_do_redirect_frame);
+
static int xdp_do_generic_redirect_map(struct net_device *dev,
struct sk_buff *skb,
struct xdp_buff *xdp,
struct bpf_prog *xdp_prog,
- struct bpf_map *map)
+ void *fwd,
+ enum bpf_map_type map_type, u32 map_id)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- u32 index = ri->tgt_index;
- void *fwd = ri->tgt_value;
- int err = 0;
-
- ri->tgt_index = 0;
- ri->tgt_value = NULL;
- WRITE_ONCE(ri->map, NULL);
-
- if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
- map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- struct bpf_dtab_netdev *dst = fwd;
+ struct bpf_map *map;
+ int err;
- err = dev_map_generic_redirect(dst, skb, xdp_prog);
+ switch (map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ fallthrough;
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ map = READ_ONCE(ri->map);
+ if (unlikely(map)) {
+ WRITE_ONCE(ri->map, NULL);
+ err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
+ ri->flags & BPF_F_EXCLUDE_INGRESS);
+ } else {
+ err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+ }
if (unlikely(err))
goto err;
- } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
- struct xdp_sock *xs = fwd;
-
- err = xsk_generic_rcv(xs, xdp);
+ break;
+ case BPF_MAP_TYPE_XSKMAP:
+ err = xsk_generic_rcv(fwd, xdp);
if (err)
goto err;
consume_skb(skb);
- } else {
- /* TODO: Handle BPF_MAP_TYPE_CPUMAP */
+ break;
+ case BPF_MAP_TYPE_CPUMAP:
+ err = cpu_map_generic_redirect(fwd, skb);
+ if (unlikely(err))
+ goto err;
+ break;
+ default:
err = -EBADRQC;
goto err;
}
- _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
return 0;
err:
- _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
return err;
}
@@ -3599,34 +4354,36 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct bpf_map *map = READ_ONCE(ri->map);
- u32 index = ri->tgt_index;
- struct net_device *fwd;
- int err = 0;
-
- if (map)
- return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
- map);
- ri->tgt_index = 0;
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
+ enum bpf_map_type map_type = ri->map_type;
+ void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
+ int err;
- err = xdp_ok_fwd_dev(fwd, skb->len);
- if (unlikely(err))
- goto err;
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
- skb->dev = fwd;
- _trace_xdp_redirect(dev, xdp_prog, index);
- generic_xdp_tx(skb, xdp_prog);
- return 0;
+ if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
+ fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = xdp_ok_fwd_dev(fwd, skb->len);
+ if (unlikely(err))
+ goto err;
+
+ skb->dev = fwd;
+ _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
+ generic_xdp_tx(skb, xdp_prog);
+ return 0;
+ }
+
+ return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id);
err:
- _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+ _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
return err;
}
-EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
@@ -3635,10 +4392,12 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
if (unlikely(flags))
return XDP_ABORTED;
- ri->flags = flags;
+ /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
+ * by map_idr) is used for ifindex based XDP redirect.
+ */
ri->tgt_index = ifindex;
- ri->tgt_value = NULL;
- WRITE_ONCE(ri->map, NULL);
+ ri->map_id = INT_MAX;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
return XDP_REDIRECT;
}
@@ -3654,28 +4413,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-
- /* Lower bits of the flags are used as return code on lookup failure */
- if (unlikely(flags > XDP_TX))
- return XDP_ABORTED;
-
- ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
- if (unlikely(!ri->tgt_value)) {
- /* If the lookup fails we want to clear out the state in the
- * redirect_info struct completely, so that if an eBPF program
- * performs multiple lookups, the last one always takes
- * precedence.
- */
- WRITE_ONCE(ri->map, NULL);
- return flags;
- }
-
- ri->flags = flags;
- ri->tgt_index = ifindex;
- WRITE_ONCE(ri->map, map);
-
- return XDP_REDIRECT;
+ return map->ops->map_redirect(map, ifindex, flags);
}
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
@@ -3721,21 +4459,22 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
-static int bpf_skb_output_btf_ids[5];
+BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff)
+
const struct bpf_func_proto bpf_skb_output_proto = {
.func = bpf_skb_event_output,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_skb_output_btf_ids[0],
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
- .btf_id = bpf_skb_output_btf_ids,
};
static unsigned short bpf_tunnel_key_af(u64 flags)
@@ -3751,7 +4490,8 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key
void *to_orig = to;
int err;
- if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
+ if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 |
+ BPF_F_TUNINFO_FLAGS)))) {
err = -EINVAL;
goto err_clear;
}
@@ -3762,6 +4502,7 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key
if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
err = -EINVAL;
switch (size) {
+ case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
case offsetof(struct bpf_tunnel_key, tunnel_label):
case offsetof(struct bpf_tunnel_key, tunnel_ext):
goto set_compat;
@@ -3782,15 +4523,22 @@ set_compat:
to->tunnel_id = be64_to_cpu(info->key.tun_id);
to->tunnel_tos = info->key.tos;
to->tunnel_ttl = info->key.ttl;
- to->tunnel_ext = 0;
+ if (flags & BPF_F_TUNINFO_FLAGS)
+ to->tunnel_flags = info->key.tun_flags;
+ else
+ to->tunnel_ext = 0;
if (flags & BPF_F_TUNINFO_IPV6) {
memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
sizeof(to->remote_ipv6));
+ memcpy(to->local_ipv6, &info->key.u.ipv6.dst,
+ sizeof(to->local_ipv6));
to->tunnel_label = be32_to_cpu(info->key.label);
} else {
to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+ to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst);
+ memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3);
to->tunnel_label = 0;
}
@@ -3861,6 +4609,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
return -EINVAL;
if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
switch (size) {
+ case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
case offsetof(struct bpf_tunnel_key, tunnel_label):
case offsetof(struct bpf_tunnel_key, tunnel_ext):
case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
@@ -3903,10 +4652,14 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
info->mode |= IP_TUNNEL_INFO_IPV6;
memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
sizeof(from->remote_ipv6));
+ memcpy(&info->key.u.ipv6.src, from->local_ipv6,
+ sizeof(from->local_ipv6));
info->key.label = cpu_to_be32(from->tunnel_label) &
IPV6_FLOWLABEL_MASK;
} else {
info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
+ info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4);
+ info->key.flow_flags = FLOWI_FLAG_ANYSRC;
}
return 0;
@@ -3917,7 +4670,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -3943,7 +4696,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@@ -4002,11 +4755,11 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
};
#ifdef CONFIG_SOCK_CGROUP_DATA
-BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
+static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
{
- struct sock *sk = skb_to_full_sk(skb);
struct cgroup *cgrp;
+ sk = sk_to_full_sk(sk);
if (!sk || !sk_fullsock(sk))
return 0;
@@ -4014,6 +4767,11 @@ BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
return cgroup_id(cgrp);
}
+BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
+{
+ return __bpf_sk_cgroup_id(skb->sk);
+}
+
static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
.func = bpf_skb_cgroup_id,
.gpl_only = false,
@@ -4021,13 +4779,13 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
- ancestor_level)
+static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
+ int ancestor_level)
{
- struct sock *sk = skb_to_full_sk(skb);
struct cgroup *ancestor;
struct cgroup *cgrp;
+ sk = sk_to_full_sk(sk);
if (!sk || !sk_fullsock(sk))
return 0;
@@ -4039,6 +4797,12 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
return cgroup_id(ancestor);
}
+BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
+ ancestor_level)
+{
+ return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level);
+}
+
static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
.func = bpf_skb_ancestor_cgroup_id,
.gpl_only = false,
@@ -4046,12 +4810,39 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
};
+
+BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
+{
+ return __bpf_sk_cgroup_id(sk);
+}
+
+static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
+ .func = bpf_sk_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+};
+
+BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
+{
+ return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
+}
+
+static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
+ .func = bpf_sk_ancestor_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+};
#endif
-static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
+static unsigned long bpf_xdp_copy(void *dst, const void *ctx,
unsigned long off, unsigned long len)
{
- memcpy(dst_buff, src_buff + off, len);
+ struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+
+ bpf_xdp_copy_buf(xdp, off, dst, len, false);
return 0;
}
@@ -4062,10 +4853,11 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
return -EINVAL;
- if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
+
+ if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp)))
return -EFAULT;
- return bpf_event_output(map, flags, meta, meta_size, xdp->data,
+ return bpf_event_output(map, flags, meta, meta_size, xdp,
xdp_size, bpf_xdp_copy);
}
@@ -4076,13 +4868,27 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff)
+
+const struct bpf_func_proto bpf_xdp_output_proto = {
+ .func = bpf_xdp_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_xdp_output_btf_ids[0],
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
{
- return skb->sk ? sock_gen_cookie(skb->sk) : 0;
+ return skb->sk ? __sock_gen_cookie(skb->sk) : 0;
}
static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
@@ -4094,7 +4900,7 @@ static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
{
- return sock_gen_cookie(ctx->sk);
+ return __sock_gen_cookie(ctx->sk);
}
static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
@@ -4104,9 +4910,33 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
+{
+ return __sock_gen_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
+ .func = bpf_get_socket_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
+{
+ return sk ? sock_gen_cookie(sk) : 0;
+}
+
+const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
+ .func = bpf_get_socket_ptr_cookie,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+};
+
BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
- return sock_gen_cookie(ctx->sk);
+ return __sock_gen_cookie(ctx->sk);
}
static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
@@ -4116,6 +4946,61 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static u64 __bpf_get_netns_cookie(struct sock *sk)
+{
+ const struct net *net = sk ? sock_net(sk) : &init_net;
+
+ return net->net_cookie;
+}
+
+BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
+ .func = bpf_get_netns_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
+ .func = bpf_get_netns_cookie_sock_addr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = {
+ .func = bpf_get_netns_cookie_sock_ops,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = {
+ .func = bpf_get_netns_cookie_sk_msg,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
{
struct sock *sk = sk_to_full_sk(skb->sk);
@@ -4134,262 +5019,530 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock,
- struct bpf_map *, map, u64, flags, void *, data, u64, size)
-{
- if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+static int sol_socket_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ switch (optname) {
+ case SO_REUSEADDR:
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ case SO_KEEPALIVE:
+ case SO_PRIORITY:
+ case SO_REUSEPORT:
+ case SO_RCVLOWAT:
+ case SO_MARK:
+ case SO_MAX_PACING_RATE:
+ case SO_BINDTOIFINDEX:
+ case SO_TXREHASH:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ case SO_BINDTODEVICE:
+ break;
+ default:
return -EINVAL;
+ }
- return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
-}
+ if (getopt) {
+ if (optname == SO_BINDTODEVICE)
+ return -EINVAL;
+ return sk_getsockopt(sk, SOL_SOCKET, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+ }
-static const struct bpf_func_proto bpf_sockopt_event_output_proto = {
- .func = bpf_sockopt_event_output,
- .gpl_only = true,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
- .arg5_type = ARG_CONST_SIZE_OR_ZERO,
-};
+ return sk_setsockopt(sk, SOL_SOCKET, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
-BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
- int, level, int, optname, char *, optval, int, optlen)
+static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
+ char *optval, int optlen)
{
- struct sock *sk = bpf_sock->sk;
- int ret = 0;
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long timeout;
int val;
- if (!sk_fullsock(sk))
+ if (optlen != sizeof(int))
return -EINVAL;
- if (level == SOL_SOCKET) {
- if (optlen != sizeof(int))
+ val = *(int *)optval;
+
+ /* Only some options are supported */
+ switch (optname) {
+ case TCP_BPF_IW:
+ if (val <= 0 || tp->data_segs_out > tp->syn_data)
return -EINVAL;
- val = *((int *)optval);
-
- /* Only some socketops are supported */
- switch (optname) {
- case SO_RCVBUF:
- val = min_t(u32, val, sysctl_rmem_max);
- sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- WRITE_ONCE(sk->sk_rcvbuf,
- max_t(int, val * 2, SOCK_MIN_RCVBUF));
- break;
- case SO_SNDBUF:
- val = min_t(u32, val, sysctl_wmem_max);
- sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- WRITE_ONCE(sk->sk_sndbuf,
- max_t(int, val * 2, SOCK_MIN_SNDBUF));
- break;
- case SO_MAX_PACING_RATE: /* 32bit version */
- if (val != ~0U)
- cmpxchg(&sk->sk_pacing_status,
- SK_PACING_NONE,
- SK_PACING_NEEDED);
- sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
- sk->sk_pacing_rate = min(sk->sk_pacing_rate,
- sk->sk_max_pacing_rate);
- break;
- case SO_PRIORITY:
- sk->sk_priority = val;
- break;
- case SO_RCVLOWAT:
- if (val < 0)
- val = INT_MAX;
- WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
- break;
- case SO_MARK:
- if (sk->sk_mark != val) {
- sk->sk_mark = val;
- sk_dst_reset(sk);
- }
- break;
- default:
- ret = -EINVAL;
- }
-#ifdef CONFIG_INET
- } else if (level == SOL_IP) {
- if (optlen != sizeof(int) || sk->sk_family != AF_INET)
+ tcp_snd_cwnd_set(tp, val);
+ break;
+ case TCP_BPF_SNDCWND_CLAMP:
+ if (val <= 0)
return -EINVAL;
+ tp->snd_cwnd_clamp = val;
+ tp->snd_ssthresh = val;
+ break;
+ case TCP_BPF_DELACK_MAX:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_DELACK_MAX ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_delack_max = timeout;
+ break;
+ case TCP_BPF_RTO_MIN:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_RTO_MIN ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_rto_min = timeout;
+ break;
+ default:
+ return -EINVAL;
+ }
- val = *((int *)optval);
- /* Only some options are supported */
- switch (optname) {
- case IP_TOS:
- if (val < -1 || val > 0xff) {
- ret = -EINVAL;
- } else {
- struct inet_sock *inet = inet_sk(sk);
+ return 0;
+}
- if (val == -1)
- val = 0;
- inet->tos = val;
- }
- break;
- default:
- ret = -EINVAL;
- }
-#if IS_ENABLED(CONFIG_IPV6)
- } else if (level == SOL_IPV6) {
- if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
+ int *optlen, bool getopt)
+{
+ struct tcp_sock *tp;
+ int ret;
+
+ if (*optlen < 2)
+ return -EINVAL;
+
+ if (getopt) {
+ if (!inet_csk(sk)->icsk_ca_ops)
return -EINVAL;
+ /* BPF expects NULL-terminated tcp-cc string */
+ optval[--(*optlen)] = '\0';
+ return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+ }
- val = *((int *)optval);
- /* Only some options are supported */
- switch (optname) {
- case IPV6_TCLASS:
- if (val < -1 || val > 0xff) {
- ret = -EINVAL;
- } else {
- struct ipv6_pinfo *np = inet6_sk(sk);
+ /* "cdg" is the only cc that alloc a ptr
+ * in inet_csk_ca area. The bpf-tcp-cc may
+ * overwrite this ptr after switching to cdg.
+ */
+ if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
+ return -ENOTSUPP;
- if (val == -1)
- val = 0;
- np->tclass = val;
- }
- break;
- default:
- ret = -EINVAL;
- }
-#endif
- } else if (level == SOL_TCP &&
- sk->sk_prot->setsockopt == tcp_setsockopt) {
- if (optname == TCP_CONGESTION) {
- char name[TCP_CA_NAME_MAX];
- bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN;
-
- strncpy(name, optval, min_t(long, optlen,
- TCP_CA_NAME_MAX-1));
- name[TCP_CA_NAME_MAX-1] = 0;
- ret = tcp_set_congestion_control(sk, name, false,
- reinit, true);
- } else {
+ /* It stops this looping
+ *
+ * .init => bpf_setsockopt(tcp_cc) => .init =>
+ * bpf_setsockopt(tcp_cc)" => .init => ....
+ *
+ * The second bpf_setsockopt(tcp_cc) is not allowed
+ * in order to break the loop when both .init
+ * are the same bpf prog.
+ *
+ * This applies even the second bpf_setsockopt(tcp_cc)
+ * does not cause a loop. This limits only the first
+ * '.init' can call bpf_setsockopt(TCP_CONGESTION) to
+ * pick a fallback cc (eg. peer does not support ECN)
+ * and the second '.init' cannot fallback to
+ * another.
+ */
+ tp = tcp_sk(sk);
+ if (tp->bpf_chg_cc_inprogress)
+ return -EBUSY;
+
+ tp->bpf_chg_cc_inprogress = 1;
+ ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+ KERNEL_SOCKPTR(optval), *optlen);
+ tp->bpf_chg_cc_inprogress = 0;
+ return ret;
+}
+
+static int sol_tcp_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ if (sk->sk_prot->setsockopt != tcp_setsockopt)
+ return -EINVAL;
+
+ switch (optname) {
+ case TCP_NODELAY:
+ case TCP_MAXSEG:
+ case TCP_KEEPIDLE:
+ case TCP_KEEPINTVL:
+ case TCP_KEEPCNT:
+ case TCP_SYNCNT:
+ case TCP_WINDOW_CLAMP:
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ case TCP_USER_TIMEOUT:
+ case TCP_NOTSENT_LOWAT:
+ case TCP_SAVE_SYN:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ case TCP_CONGESTION:
+ return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
+ case TCP_SAVED_SYN:
+ if (*optlen < 1)
+ return -EINVAL;
+ break;
+ default:
+ if (getopt)
+ return -EINVAL;
+ return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
+ }
+
+ if (getopt) {
+ if (optname == TCP_SAVED_SYN) {
struct tcp_sock *tp = tcp_sk(sk);
- if (optlen != sizeof(int))
+ if (!tp->saved_syn ||
+ *optlen > tcp_saved_syn_len(tp->saved_syn))
return -EINVAL;
-
- val = *((int *)optval);
- /* Only some options are supported */
- switch (optname) {
- case TCP_BPF_IW:
- if (val <= 0 || tp->data_segs_out > tp->syn_data)
- ret = -EINVAL;
- else
- tp->snd_cwnd = val;
- break;
- case TCP_BPF_SNDCWND_CLAMP:
- if (val <= 0) {
- ret = -EINVAL;
- } else {
- tp->snd_cwnd_clamp = val;
- tp->snd_ssthresh = val;
- }
- break;
- case TCP_SAVE_SYN:
- if (val < 0 || val > 1)
- ret = -EINVAL;
- else
- tp->save_syn = val;
- break;
- default:
- ret = -EINVAL;
- }
+ memcpy(optval, tp->saved_syn->data, *optlen);
+ /* It cannot free tp->saved_syn here because it
+ * does not know if the user space still needs it.
+ */
+ return 0;
}
-#endif
- } else {
- ret = -EINVAL;
+
+ return do_tcp_getsockopt(sk, SOL_TCP, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
}
- return ret;
+
+ return do_tcp_setsockopt(sk, SOL_TCP, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
+
+static int sol_ip_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ if (sk->sk_family != AF_INET)
+ return -EINVAL;
+
+ switch (optname) {
+ case IP_TOS:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (getopt)
+ return do_ip_getsockopt(sk, SOL_IP, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+
+ return do_ip_setsockopt(sk, SOL_IP, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
+
+static int sol_ipv6_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ if (sk->sk_family != AF_INET6)
+ return -EINVAL;
+
+ switch (optname) {
+ case IPV6_TCLASS:
+ case IPV6_AUTOFLOWLABEL:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (getopt)
+ return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+
+ return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
+
+static int __bpf_setsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (!sk_fullsock(sk))
+ return -EINVAL;
+
+ if (level == SOL_SOCKET)
+ return sol_socket_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
+ return sol_ip_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
+ return sol_ipv6_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
+ return sol_tcp_sockopt(sk, optname, optval, &optlen, false);
+
+ return -EINVAL;
+}
+
+static int _bpf_setsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (sk_fullsock(sk))
+ sock_owned_by_me(sk);
+ return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+static int __bpf_getsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ int err, saved_optlen = optlen;
+
+ if (!sk_fullsock(sk)) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (level == SOL_SOCKET)
+ err = sol_socket_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
+ err = sol_tcp_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
+ err = sol_ip_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
+ err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true);
+ else
+ err = -EINVAL;
+
+done:
+ if (err)
+ optlen = 0;
+ if (optlen < saved_optlen)
+ memset(optval + optlen, 0, saved_optlen - optlen);
+ return err;
+}
+
+static int _bpf_getsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (sk_fullsock(sk))
+ sock_owned_by_me(sk);
+ return __bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ return _bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_setsockopt_proto = {
+ .func = bpf_sk_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ return _bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_getsockopt_proto = {
+ .func = bpf_sk_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = {
+ .func = bpf_unlocked_sk_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ return __bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = {
+ .func = bpf_unlocked_sk_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen);
}
-static const struct bpf_func_proto bpf_setsockopt_proto = {
- .func = bpf_setsockopt,
+static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
+ .func = bpf_sock_addr_setsockopt,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE,
};
-BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx,
int, level, int, optname, char *, optval, int, optlen)
{
- struct sock *sk = bpf_sock->sk;
+ return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen);
+}
- if (!sk_fullsock(sk))
- goto err_clear;
-#ifdef CONFIG_INET
- if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
- struct inet_connection_sock *icsk;
- struct tcp_sock *tp;
+static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
+ .func = bpf_sock_addr_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
- switch (optname) {
- case TCP_CONGESTION:
- icsk = inet_csk(sk);
+BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
+}
- if (!icsk->icsk_ca_ops || optlen <= 1)
- goto err_clear;
- strncpy(optval, icsk->icsk_ca_ops->name, optlen);
- optval[optlen - 1] = 0;
- break;
- case TCP_SAVED_SYN:
- tp = tcp_sk(sk);
+static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
+ .func = bpf_sock_ops_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
- if (optlen <= 0 || !tp->saved_syn ||
- optlen > tp->saved_syn[0])
- goto err_clear;
- memcpy(optval, tp->saved_syn + 1, optlen);
- break;
- default:
- goto err_clear;
+static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
+ int optname, const u8 **start)
+{
+ struct sk_buff *syn_skb = bpf_sock->syn_skb;
+ const u8 *hdr_start;
+ int ret;
+
+ if (syn_skb) {
+ /* sk is a request_sock here */
+
+ if (optname == TCP_BPF_SYN) {
+ hdr_start = syn_skb->data;
+ ret = tcp_hdrlen(syn_skb);
+ } else if (optname == TCP_BPF_SYN_IP) {
+ hdr_start = skb_network_header(syn_skb);
+ ret = skb_network_header_len(syn_skb) +
+ tcp_hdrlen(syn_skb);
+ } else {
+ /* optname == TCP_BPF_SYN_MAC */
+ hdr_start = skb_mac_header(syn_skb);
+ ret = skb_mac_header_len(syn_skb) +
+ skb_network_header_len(syn_skb) +
+ tcp_hdrlen(syn_skb);
}
- } else if (level == SOL_IP) {
- struct inet_sock *inet = inet_sk(sk);
+ } else {
+ struct sock *sk = bpf_sock->sk;
+ struct saved_syn *saved_syn;
- if (optlen != sizeof(int) || sk->sk_family != AF_INET)
- goto err_clear;
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ /* synack retransmit. bpf_sock->syn_skb will
+ * not be available. It has to resort to
+ * saved_syn (if it is saved).
+ */
+ saved_syn = inet_reqsk(sk)->saved_syn;
+ else
+ saved_syn = tcp_sk(sk)->saved_syn;
- /* Only some options are supported */
- switch (optname) {
- case IP_TOS:
- *((int *)optval) = (int)inet->tos;
- break;
- default:
- goto err_clear;
+ if (!saved_syn)
+ return -ENOENT;
+
+ if (optname == TCP_BPF_SYN) {
+ hdr_start = saved_syn->data +
+ saved_syn->mac_hdrlen +
+ saved_syn->network_hdrlen;
+ ret = saved_syn->tcp_hdrlen;
+ } else if (optname == TCP_BPF_SYN_IP) {
+ hdr_start = saved_syn->data +
+ saved_syn->mac_hdrlen;
+ ret = saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
+ } else {
+ /* optname == TCP_BPF_SYN_MAC */
+
+ /* TCP_SAVE_SYN may not have saved the mac hdr */
+ if (!saved_syn->mac_hdrlen)
+ return -ENOENT;
+
+ hdr_start = saved_syn->data;
+ ret = saved_syn->mac_hdrlen +
+ saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
}
-#if IS_ENABLED(CONFIG_IPV6)
- } else if (level == SOL_IPV6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
+ }
- if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
- goto err_clear;
+ *start = hdr_start;
+ return ret;
+}
- /* Only some options are supported */
- switch (optname) {
- case IPV6_TCLASS:
- *((int *)optval) = (int)np->tclass;
- break;
- default:
- goto err_clear;
+BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
+ optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
+ int ret, copy_len = 0;
+ const u8 *start;
+
+ ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start);
+ if (ret > 0) {
+ copy_len = ret;
+ if (optlen < copy_len) {
+ copy_len = optlen;
+ ret = -ENOSPC;
+ }
+
+ memcpy(optval, start, copy_len);
}
-#endif
- } else {
- goto err_clear;
+
+ /* Zero out unused buffer at the end */
+ memset(optval + copy_len, 0, optlen - copy_len);
+
+ return ret;
}
- return 0;
-#endif
-err_clear:
- memset(optval, 0, optlen);
- return -EINVAL;
+
+ return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen);
}
-static const struct bpf_func_proto bpf_getsockopt_proto = {
- .func = bpf_getsockopt,
+static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
+ .func = bpf_sock_ops_getsockopt,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
@@ -4429,30 +5582,28 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
{
#ifdef CONFIG_INET
struct sock *sk = ctx->sk;
+ u32 flags = BIND_FROM_BPF;
int err;
- /* Binding to port can be expensive so it's prohibited in the helper.
- * Only binding to IP is supported.
- */
err = -EINVAL;
if (addr_len < offsetofend(struct sockaddr, sa_family))
return err;
if (addr->sa_family == AF_INET) {
if (addr_len < sizeof(struct sockaddr_in))
return err;
- if (((struct sockaddr_in *)addr)->sin_port != htons(0))
- return err;
- return __inet_bind(sk, addr, addr_len, true, false);
+ if (((struct sockaddr_in *)addr)->sin_port == htons(0))
+ flags |= BIND_FORCE_ADDRESS_NO_PORT;
+ return __inet_bind(sk, addr, addr_len, flags);
#if IS_ENABLED(CONFIG_IPV6)
} else if (addr->sa_family == AF_INET6) {
if (addr_len < SIN6_LEN_RFC2133)
return err;
- if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
- return err;
+ if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
+ flags |= BIND_FORCE_ADDRESS_NO_PORT;
/* ipv6_bpf_stub cannot be NULL, since it's called from
* bpf_cgroup_inet6_connect hook and ipv6 is already loaded
*/
- return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false);
+ return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
#endif /* CONFIG_IPV6 */
}
#endif /* CONFIG_INET */
@@ -4465,7 +5616,7 @@ static const struct bpf_func_proto bpf_bind_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@@ -4518,13 +5669,14 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
const struct neighbour *neigh,
- const struct net_device *dev)
+ const struct net_device *dev, u32 mtu)
{
memcpy(params->dmac, neigh->ha, ETH_ALEN);
memcpy(params->smac, dev->dev_addr, ETH_ALEN);
params->h_vlan_TCI = 0;
params->h_vlan_proto = 0;
- params->ifindex = dev->ifindex;
+ if (mtu)
+ params->mtu_result = mtu; /* union with tot_len */
return 0;
}
@@ -4540,8 +5692,8 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
struct net_device *dev;
struct fib_result res;
struct flowi4 fl4;
+ u32 mtu = 0;
int err;
- u32 mtu;
dev = dev_get_by_index_rcu(net, params->ifindex);
if (unlikely(!dev))
@@ -4568,6 +5720,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
fl4.saddr = params->ipv4_src;
fl4.fl4_sport = params->sport;
fl4.fl4_dport = params->dport;
+ fl4.flowi4_multipath_hash = 0;
if (flags & BPF_FIB_LOOKUP_DIRECT) {
u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
@@ -4607,8 +5760,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (check_mtu) {
mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
- if (params->tot_len > mtu)
+ if (params->tot_len > mtu) {
+ params->mtu_result = mtu; /* union with tot_len */
return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+ }
}
nhc = res.nhc;
@@ -4620,6 +5775,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
dev = nhc->nhc_dev;
params->rt_metric = res.fi->fib_priority;
+ params->ifindex = dev->ifindex;
/* xdp and cls_bpf programs are run in RCU-bh so
* rcu_read_lock_bh is not needed here
@@ -4641,7 +5797,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (!neigh)
return BPF_FIB_LKUP_RET_NO_NEIGH;
- return bpf_fib_set_fwd_params(params, neigh, dev);
+ return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
}
#endif
@@ -4658,7 +5814,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
struct flowi6 fl6;
int strict = 0;
int oif, err;
- u32 mtu;
+ u32 mtu = 0;
/* link local addresses are never forwarded */
if (rt6_need_strict(dst) || rt6_need_strict(src))
@@ -4733,8 +5889,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (check_mtu) {
mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
- if (params->tot_len > mtu)
+ if (params->tot_len > mtu) {
+ params->mtu_result = mtu; /* union with tot_len */
return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+ }
}
if (res.nh->fib_nh_lws)
@@ -4745,6 +5903,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
dev = res.nh->fib_nh_dev;
params->rt_metric = res.f6i->fib6_metric;
+ params->ifindex = dev->ifindex;
/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
* not needed here.
@@ -4753,7 +5912,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (!neigh)
return BPF_FIB_LKUP_RET_NO_NEIGH;
- return bpf_fib_set_fwd_params(params, neigh, dev);
+ return bpf_fib_set_fwd_params(params, neigh, dev, mtu);
}
#endif
@@ -4796,6 +5955,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
{
struct net *net = dev_net(skb->dev);
int rc = -EAFNOSUPPORT;
+ bool check_mtu = false;
if (plen < sizeof(*params))
return -EINVAL;
@@ -4803,25 +5963,33 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
return -EINVAL;
+ if (params->tot_len)
+ check_mtu = true;
+
switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
case AF_INET:
- rc = bpf_ipv4_fib_lookup(net, params, flags, false);
+ rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
break;
#endif
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- rc = bpf_ipv6_fib_lookup(net, params, flags, false);
+ rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
break;
#endif
}
- if (!rc) {
+ if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
struct net_device *dev;
+ /* When tot_len isn't provided by user, check skb
+ * against MTU of FIB lookup resulting net_device
+ */
dev = dev_get_by_index_rcu(net, params->ifindex);
if (!is_skb_forwardable(dev, skb))
rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
+
+ params->mtu_result = dev->mtu; /* union with tot_len */
}
return rc;
@@ -4837,13 +6005,131 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
.arg4_type = ARG_ANYTHING,
};
+static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
+ u32 ifindex)
+{
+ struct net *netns = dev_net(dev_curr);
+
+ /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */
+ if (ifindex == 0)
+ return dev_curr;
+
+ return dev_get_by_index_rcu(netns, ifindex);
+}
+
+BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
+ u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
+{
+ int ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
+ struct net_device *dev = skb->dev;
+ int skb_len, dev_len;
+ int mtu;
+
+ if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
+ return -EINVAL;
+
+ if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
+ return -EINVAL;
+
+ dev = __dev_via_ifindex(dev, ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ mtu = READ_ONCE(dev->mtu);
+
+ dev_len = mtu + dev->hard_header_len;
+
+ /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+ skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len;
+
+ skb_len += len_diff; /* minus result pass check */
+ if (skb_len <= dev_len) {
+ ret = BPF_MTU_CHK_RET_SUCCESS;
+ goto out;
+ }
+ /* At this point, skb->len exceed MTU, but as it include length of all
+ * segments, it can still be below MTU. The SKB can possibly get
+ * re-segmented in transmit path (see validate_xmit_skb). Thus, user
+ * must choose if segs are to be MTU checked.
+ */
+ if (skb_is_gso(skb)) {
+ ret = BPF_MTU_CHK_RET_SUCCESS;
+
+ if (flags & BPF_MTU_CHK_SEGS &&
+ !skb_gso_validate_network_len(skb, mtu))
+ ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
+ }
+out:
+ /* BPF verifier guarantees valid pointer */
+ *mtu_len = mtu;
+
+ return ret;
+}
+
+BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
+ u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
+{
+ struct net_device *dev = xdp->rxq->dev;
+ int xdp_len = xdp->data_end - xdp->data;
+ int ret = BPF_MTU_CHK_RET_SUCCESS;
+ int mtu, dev_len;
+
+ /* XDP variant doesn't support multi-buffer segment check (yet) */
+ if (unlikely(flags))
+ return -EINVAL;
+
+ dev = __dev_via_ifindex(dev, ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ mtu = READ_ONCE(dev->mtu);
+
+ /* Add L2-header as dev MTU is L3 size */
+ dev_len = mtu + dev->hard_header_len;
+
+ /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+ if (*mtu_len)
+ xdp_len = *mtu_len + dev->hard_header_len;
+
+ xdp_len += len_diff; /* minus result pass check */
+ if (xdp_len > dev_len)
+ ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
+
+ /* BPF verifier guarantees valid pointer */
+ *mtu_len = mtu;
+
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
+ .func = bpf_skb_check_mtu,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_INT,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
+ .func = bpf_xdp_check_mtu,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_INT,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
{
int err;
struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
- if (!seg6_validate_srh(srh, len))
+ if (!seg6_validate_srh(srh, len, false))
return -EINVAL;
switch (type) {
@@ -4866,7 +6152,6 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
if (err)
return err;
- ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
return seg6_lookup_nexthop(skb, NULL, 0);
@@ -4918,7 +6203,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE
};
@@ -4928,7 +6213,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE
};
@@ -4971,7 +6256,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE
};
@@ -5059,7 +6344,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE
};
@@ -5127,6 +6412,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
int dif, int sdif, u8 family, u8 proto)
{
+ struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
bool refcounted = false;
struct sock *sk = NULL;
@@ -5135,7 +6421,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
__be32 dst4 = tuple->ipv4.daddr;
if (proto == IPPROTO_TCP)
- sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0,
+ sk = __inet_lookup(net, hinfo, NULL, 0,
src4, tuple->ipv4.sport,
dst4, tuple->ipv4.dport,
dif, sdif, &refcounted);
@@ -5149,7 +6435,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
if (proto == IPPROTO_TCP)
- sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0,
+ sk = __inet6_lookup(net, hinfo, NULL, 0,
src6, tuple->ipv6.sport,
dst6, ntohs(tuple->ipv6.dport),
dif, sdif, &refcounted);
@@ -5171,8 +6457,6 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
/* bpf_skc_lookup performs the core lookup for different types of sockets,
* taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
- * Returns the socket as an 'unsigned long' to simplify the casting in the
- * callers to satisfy BPF_CALL declarations.
*/
static struct sock *
__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
@@ -5180,8 +6464,8 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
u64 flags)
{
struct sock *sk = NULL;
- u8 family = AF_UNSPEC;
struct net *net;
+ u8 family;
int sdif;
if (len == sizeof(tuple->ipv4))
@@ -5191,8 +6475,7 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
else
return NULL;
- if (unlikely(family == AF_UNSPEC || flags ||
- !((s32)netns_id < 0 || netns_id <= S32_MAX)))
+ if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX)))
goto out;
if (family == AF_INET)
@@ -5224,10 +6507,21 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
ifindex, proto, netns_id, flags);
if (sk) {
- sk = sk_to_full_sk(sk);
- if (!sk_fullsock(sk)) {
+ struct sock *sk2 = sk_to_full_sk(sk);
+
+ /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+ * sock refcnt is decremented to prevent a request_sock leak.
+ */
+ if (!sk_fullsock(sk2))
+ sk2 = NULL;
+ if (sk2 != sk) {
sock_gen_put(sk);
- return NULL;
+ /* Ensure there is no need to bump sk2 refcnt */
+ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ return NULL;
+ }
+ sk = sk2;
}
}
@@ -5261,10 +6555,21 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
flags);
if (sk) {
- sk = sk_to_full_sk(sk);
- if (!sk_fullsock(sk)) {
+ struct sock *sk2 = sk_to_full_sk(sk);
+
+ /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+ * sock refcnt is decremented to prevent a request_sock leak.
+ */
+ if (!sk_fullsock(sk2))
+ sk2 = NULL;
+ if (sk2 != sk) {
sock_gen_put(sk);
- return NULL;
+ /* Ensure there is no need to bump sk2 refcnt */
+ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ return NULL;
+ }
+ sk = sk2;
}
}
@@ -5284,7 +6589,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
.pkt_access = true,
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
@@ -5303,7 +6608,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
.pkt_access = true,
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
@@ -5322,7 +6627,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
.pkt_access = true,
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
@@ -5330,8 +6635,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
- /* Only full sockets have sk->sk_flags. */
- if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE))
+ if (sk && sk_is_refcounted(sk))
sock_gen_put(sk);
return 0;
}
@@ -5340,7 +6644,7 @@ static const struct bpf_func_proto bpf_sk_release_proto = {
.func = bpf_sk_release,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE,
};
BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
@@ -5360,7 +6664,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
.pkt_access = true,
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
@@ -5383,7 +6687,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
.pkt_access = true,
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
@@ -5406,7 +6710,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
.pkt_access = true,
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
@@ -5425,7 +6729,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
@@ -5444,7 +6748,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
@@ -5463,7 +6767,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
@@ -5648,12 +6952,16 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
{
unsigned int iphdr_len;
- if (skb->protocol == cpu_to_be16(ETH_P_IP))
+ switch (skb_protocol(skb, true)) {
+ case cpu_to_be16(ETH_P_IP):
iphdr_len = sizeof(struct iphdr);
- else if (skb->protocol == cpu_to_be16(ETH_P_IPV6))
+ break;
+ case cpu_to_be16(ETH_P_IPV6):
iphdr_len = sizeof(struct ipv6hdr);
- else
+ break;
+ default:
return 0;
+ }
if (skb_headlen(skb) < iphdr_len)
return 0;
@@ -5718,37 +7026,46 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len
u32 cookie;
int ret;
- if (unlikely(th_len < sizeof(*th)))
+ if (unlikely(!sk || th_len < sizeof(*th)))
return -EINVAL;
/* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
return -EINVAL;
- if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies)
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
return -EINVAL;
if (!th->ack || th->rst || th->syn)
return -ENOENT;
+ if (unlikely(iph_len < sizeof(struct iphdr)))
+ return -EINVAL;
+
if (tcp_synq_no_recent_overflow(sk))
return -ENOENT;
cookie = ntohl(th->ack_seq) - 1;
- switch (sk->sk_family) {
- case AF_INET:
- if (unlikely(iph_len < sizeof(struct iphdr)))
+ /* Both struct iphdr and struct ipv6hdr have the version field at the
+ * same offset so we can cast to the shorter header (struct iphdr).
+ */
+ switch (((struct iphdr *)iph)->version) {
+ case 4:
+ if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
return -EINVAL;
ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
break;
#if IS_BUILTIN(CONFIG_IPV6)
- case AF_INET6:
+ case 6:
if (unlikely(iph_len < sizeof(struct ipv6hdr)))
return -EINVAL;
+ if (sk->sk_family != AF_INET6)
+ return -EINVAL;
+
ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
break;
#endif /* CONFIG_IPV6 */
@@ -5771,10 +7088,10 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
.gpl_only = true,
.pkt_access = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_SOCK_COMMON,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE,
};
@@ -5785,13 +7102,13 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
u32 cookie;
u16 mss;
- if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
+ if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4))
return -EINVAL;
if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
return -EINVAL;
- if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies)
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
return -ENOENT;
if (!th->syn || th->ack || th->fin || th->rst)
@@ -5805,7 +7122,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
*/
switch (((struct iphdr *)iph)->version) {
case 4:
- if (sk->sk_family == AF_INET6 && sk->sk_ipv6only)
+ if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
return -EINVAL;
mss = tcp_v4_get_syncookie(sk, iph, th, &cookie);
@@ -5840,13 +7157,414 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
.gpl_only = true, /* __cookie_v*_init_sequence() is GPL */
.pkt_access = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_SOCK_COMMON,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE,
};
+BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
+{
+ if (!sk || flags != 0)
+ return -EINVAL;
+ if (!skb_at_tc_ingress(skb))
+ return -EOPNOTSUPP;
+ if (unlikely(dev_net(skb->dev) != sock_net(sk)))
+ return -ENETUNREACH;
+ if (unlikely(sk_fullsock(sk) && sk->sk_reuseport))
+ return -ESOCKTNOSUPPORT;
+ if (sk_is_refcounted(sk) &&
+ unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+ return -ENOENT;
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_pfree;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_assign_proto = {
+ .func = bpf_sk_assign,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
+ u8 search_kind, const u8 *magic,
+ u8 magic_len, bool *eol)
+{
+ u8 kind, kind_len;
+
+ *eol = false;
+
+ while (op < opend) {
+ kind = op[0];
+
+ if (kind == TCPOPT_EOL) {
+ *eol = true;
+ return ERR_PTR(-ENOMSG);
+ } else if (kind == TCPOPT_NOP) {
+ op++;
+ continue;
+ }
+
+ if (opend - op < 2 || opend - op < op[1] || op[1] < 2)
+ /* Something is wrong in the received header.
+ * Follow the TCP stack's tcp_parse_options()
+ * and just bail here.
+ */
+ return ERR_PTR(-EFAULT);
+
+ kind_len = op[1];
+ if (search_kind == kind) {
+ if (!magic_len)
+ return op;
+
+ if (magic_len > kind_len - 2)
+ return ERR_PTR(-ENOMSG);
+
+ if (!memcmp(&op[2], magic, magic_len))
+ return op;
+ }
+
+ op += kind_len;
+ }
+
+ return ERR_PTR(-ENOMSG);
+}
+
+BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ void *, search_res, u32, len, u64, flags)
+{
+ bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
+ const u8 *op, *opend, *magic, *search = search_res;
+ u8 search_kind, search_len, copy_len, magic_len;
+ int ret;
+
+ /* 2 byte is the minimal option len except TCPOPT_NOP and
+ * TCPOPT_EOL which are useless for the bpf prog to learn
+ * and this helper disallow loading them also.
+ */
+ if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN)
+ return -EINVAL;
+
+ search_kind = search[0];
+ search_len = search[1];
+
+ if (search_len > len || search_kind == TCPOPT_NOP ||
+ search_kind == TCPOPT_EOL)
+ return -EINVAL;
+
+ if (search_kind == TCPOPT_EXP || search_kind == 253) {
+ /* 16 or 32 bit magic. +2 for kind and kind length */
+ if (search_len != 4 && search_len != 6)
+ return -EINVAL;
+ magic = &search[2];
+ magic_len = search_len - 2;
+ } else {
+ if (search_len)
+ return -EINVAL;
+ magic = NULL;
+ magic_len = 0;
+ }
+
+ if (load_syn) {
+ ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op);
+ if (ret < 0)
+ return ret;
+
+ opend = op + ret;
+ op += sizeof(struct tcphdr);
+ } else {
+ if (!bpf_sock->skb ||
+ bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
+ /* This bpf_sock->op cannot call this helper */
+ return -EPERM;
+
+ opend = bpf_sock->skb_data_end;
+ op = bpf_sock->skb->data + sizeof(struct tcphdr);
+ }
+
+ op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len,
+ &eol);
+ if (IS_ERR(op))
+ return PTR_ERR(op);
+
+ copy_len = op[1];
+ ret = copy_len;
+ if (copy_len > len) {
+ ret = -ENOSPC;
+ copy_len = len;
+ }
+
+ memcpy(search_res, op, copy_len);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
+ .func = bpf_sock_ops_load_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ const void *, from, u32, len, u64, flags)
+{
+ u8 new_kind, new_kind_len, magic_len = 0, *opend;
+ const u8 *op, *new_op, *magic = NULL;
+ struct sk_buff *skb;
+ bool eol;
+
+ if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
+ return -EPERM;
+
+ if (len < 2 || flags)
+ return -EINVAL;
+
+ new_op = from;
+ new_kind = new_op[0];
+ new_kind_len = new_op[1];
+
+ if (new_kind_len > len || new_kind == TCPOPT_NOP ||
+ new_kind == TCPOPT_EOL)
+ return -EINVAL;
+
+ if (new_kind_len > bpf_sock->remaining_opt_len)
+ return -ENOSPC;
+
+ /* 253 is another experimental kind */
+ if (new_kind == TCPOPT_EXP || new_kind == 253) {
+ if (new_kind_len < 4)
+ return -EINVAL;
+ /* Match for the 2 byte magic also.
+ * RFC 6994: the magic could be 2 or 4 bytes.
+ * Hence, matching by 2 byte only is on the
+ * conservative side but it is the right
+ * thing to do for the 'search-for-duplication'
+ * purpose.
+ */
+ magic = &new_op[2];
+ magic_len = 2;
+ }
+
+ /* Check for duplication */
+ skb = bpf_sock->skb;
+ op = skb->data + sizeof(struct tcphdr);
+ opend = bpf_sock->skb_data_end;
+
+ op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len,
+ &eol);
+ if (!IS_ERR(op))
+ return -EEXIST;
+
+ if (PTR_ERR(op) != -ENOMSG)
+ return PTR_ERR(op);
+
+ if (eol)
+ /* The option has been ended. Treat it as no more
+ * header option can be written.
+ */
+ return -ENOSPC;
+
+ /* No duplication found. Store the header option. */
+ memcpy(opend, from, new_kind_len);
+
+ bpf_sock->remaining_opt_len -= new_kind_len;
+ bpf_sock->skb_data_end += new_kind_len;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
+ .func = bpf_sock_ops_store_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ u32, len, u64, flags)
+{
+ if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
+ return -EPERM;
+
+ if (flags || len < 2)
+ return -EINVAL;
+
+ if (len > bpf_sock->remaining_opt_len)
+ return -ENOSPC;
+
+ bpf_sock->remaining_opt_len -= len;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
+ .func = bpf_sock_ops_reserve_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
+ u64, tstamp, u32, tstamp_type)
+{
+ /* skb_clear_delivery_time() is done for inet protocol */
+ if (skb->protocol != htons(ETH_P_IP) &&
+ skb->protocol != htons(ETH_P_IPV6))
+ return -EOPNOTSUPP;
+
+ switch (tstamp_type) {
+ case BPF_SKB_TSTAMP_DELIVERY_MONO:
+ if (!tstamp)
+ return -EINVAL;
+ skb->tstamp = tstamp;
+ skb->mono_delivery_time = 1;
+ break;
+ case BPF_SKB_TSTAMP_UNSPEC:
+ if (tstamp)
+ return -EINVAL;
+ skb->tstamp = 0;
+ skb->mono_delivery_time = 0;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tstamp_proto = {
+ .func = bpf_skb_set_tstamp,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+#ifdef CONFIG_SYN_COOKIES
+BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph,
+ struct tcphdr *, th, u32, th_len)
+{
+ u32 cookie;
+ u16 mss;
+
+ if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
+ return -EINVAL;
+
+ mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT;
+ cookie = __cookie_v4_init_sequence(iph, th, &mss);
+
+ return cookie | ((u64)mss << 32);
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = {
+ .func = bpf_tcp_raw_gen_syncookie_ipv4,
+ .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg1_size = sizeof(struct iphdr),
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph,
+ struct tcphdr *, th, u32, th_len)
+{
+#if IS_BUILTIN(CONFIG_IPV6)
+ const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
+ sizeof(struct ipv6hdr);
+ u32 cookie;
+ u16 mss;
+
+ if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
+ return -EINVAL;
+
+ mss = tcp_parse_mss_option(th, 0) ?: mss_clamp;
+ cookie = __cookie_v6_init_sequence(iph, th, &mss);
+
+ return cookie | ((u64)mss << 32);
+#else
+ return -EPROTONOSUPPORT;
+#endif
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
+ .func = bpf_tcp_raw_gen_syncookie_ipv6,
+ .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg1_size = sizeof(struct ipv6hdr),
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph,
+ struct tcphdr *, th)
+{
+ u32 cookie = ntohl(th->ack_seq) - 1;
+
+ if (__cookie_v4_check(iph, th, cookie) > 0)
+ return 0;
+
+ return -EACCES;
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = {
+ .func = bpf_tcp_raw_check_syncookie_ipv4,
+ .gpl_only = true, /* __cookie_v4_check is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg1_size = sizeof(struct iphdr),
+ .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg2_size = sizeof(struct tcphdr),
+};
+
+BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph,
+ struct tcphdr *, th)
+{
+#if IS_BUILTIN(CONFIG_IPV6)
+ u32 cookie = ntohl(th->ack_seq) - 1;
+
+ if (__cookie_v6_check(iph, th, cookie) > 0)
+ return 0;
+
+ return -EACCES;
+#else
+ return -EPROTONOSUPPORT;
+#endif
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = {
+ .func = bpf_tcp_raw_check_syncookie_ipv6,
+ .gpl_only = true, /* __cookie_v6_check is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg1_size = sizeof(struct ipv6hdr),
+ .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg2_size = sizeof(struct tcphdr),
+};
+#endif /* CONFIG_SYN_COOKIES */
+
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
@@ -5860,6 +7578,7 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_skb_change_tail ||
func == sk_skb_change_tail ||
func == bpf_skb_adjust_room ||
+ func == sk_skb_adjust_room ||
func == bpf_skb_pull_data ||
func == sk_skb_pull_data ||
func == bpf_clone_redirect ||
@@ -5876,6 +7595,9 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_lwt_seg6_adjust_srh ||
func == bpf_lwt_seg6_action ||
#endif
+#ifdef CONFIG_INET
+ func == bpf_sock_ops_store_hdr_opt ||
+#endif
func == bpf_lwt_in_push_encap ||
func == bpf_lwt_xmit_push_encap)
return true;
@@ -5883,64 +7605,33 @@ bool bpf_helper_changes_pkt_data(void *func)
return false;
}
-const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
-{
- switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_map_push_elem:
- return &bpf_map_push_elem_proto;
- case BPF_FUNC_map_pop_elem:
- return &bpf_map_pop_elem_proto;
- case BPF_FUNC_map_peek_elem:
- return &bpf_map_peek_elem_proto;
- case BPF_FUNC_get_prandom_u32:
- return &bpf_get_prandom_u32_proto;
- case BPF_FUNC_get_smp_processor_id:
- return &bpf_get_raw_smp_processor_id_proto;
- case BPF_FUNC_get_numa_node_id:
- return &bpf_get_numa_node_id_proto;
- case BPF_FUNC_tail_call:
- return &bpf_tail_call_proto;
- case BPF_FUNC_ktime_get_ns:
- return &bpf_ktime_get_ns_proto;
- default:
- break;
- }
-
- if (!capable(CAP_SYS_ADMIN))
- return NULL;
-
- switch (func_id) {
- case BPF_FUNC_spin_lock:
- return &bpf_spin_lock_proto;
- case BPF_FUNC_spin_unlock:
- return &bpf_spin_unlock_proto;
- case BPF_FUNC_trace_printk:
- return bpf_get_trace_printk_proto();
- case BPF_FUNC_jiffies64:
- return &bpf_jiffies64_proto;
- default:
- return NULL;
- }
-}
+const struct bpf_func_proto bpf_event_output_data_proto __weak;
+const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak;
static const struct bpf_func_proto *
sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- /* inet and inet6 sockets are created in a process
- * context so there is always a valid uid/gid
- */
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_sock_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_cg_sock_proto;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -5949,12 +7640,17 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
static const struct bpf_func_proto *
sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- /* inet and inet6 sockets are created in a process
- * context so there is always a valid uid/gid
- */
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_bind:
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_CONNECT:
@@ -5965,8 +7661,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_addr_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_addr_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_tcp:
return &bpf_sock_addr_sk_lookup_tcp_proto;
@@ -5981,8 +7679,44 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ return &bpf_sock_addr_setsockopt_proto;
+ default:
+ return NULL;
+ }
+ case BPF_FUNC_getsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ return &bpf_sock_addr_getsockopt_proto;
+ default:
+ return NULL;
+ }
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6001,7 +7735,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_perf_event_output:
return &bpf_skb_event_output_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6011,9 +7745,13 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;
static const struct bpf_func_proto *
cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
case BPF_FUNC_sk_fullsock:
return &bpf_sk_fullsock_proto;
case BPF_FUNC_sk_storage_get:
@@ -6025,8 +7763,22 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#ifdef CONFIG_SOCK_CGROUP_DATA
case BPF_FUNC_skb_cgroup_id:
return &bpf_skb_cgroup_id_proto;
+ case BPF_FUNC_skb_ancestor_cgroup_id:
+ return &bpf_skb_ancestor_cgroup_id_proto;
+ case BPF_FUNC_sk_cgroup_id:
+ return &bpf_sk_cgroup_id_proto;
+ case BPF_FUNC_sk_ancestor_cgroup_id:
+ return &bpf_sk_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ case BPF_FUNC_skc_lookup_tcp:
+ return &bpf_skc_lookup_tcp_proto;
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
case BPF_FUNC_get_listener_sock:
@@ -6055,6 +7807,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_csum_diff_proto;
case BPF_FUNC_csum_update:
return &bpf_csum_update_proto;
+ case BPF_FUNC_csum_level:
+ return &bpf_csum_level_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
@@ -6075,6 +7829,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_adjust_room_proto;
case BPF_FUNC_skb_change_tail:
return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
case BPF_FUNC_skb_get_tunnel_key:
return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key:
@@ -6085,6 +7841,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return bpf_get_skb_set_tunnel_proto(func_id);
case BPF_FUNC_redirect:
return &bpf_redirect_proto;
+ case BPF_FUNC_redirect_neigh:
+ return &bpf_redirect_neigh_proto;
+ case BPF_FUNC_redirect_peer:
+ return &bpf_redirect_peer_proto;
case BPF_FUNC_get_route_realm:
return &bpf_get_route_realm_proto;
case BPF_FUNC_get_hash_recalc:
@@ -6105,6 +7865,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_uid_proto;
case BPF_FUNC_fib_lookup:
return &bpf_skb_fib_lookup_proto;
+ case BPF_FUNC_check_mtu:
+ return &bpf_skb_check_mtu_proto;
case BPF_FUNC_sk_fullsock:
return &bpf_sk_fullsock_proto;
case BPF_FUNC_sk_storage_get:
@@ -6115,6 +7877,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_skb_get_xfrm_state:
return &bpf_skb_get_xfrm_state_proto;
#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_skb_cgroup_classid:
+ return &bpf_skb_cgroup_classid_proto;
+#endif
#ifdef CONFIG_SOCK_CGROUP_DATA
case BPF_FUNC_skb_cgroup_id:
return &bpf_skb_cgroup_id_proto;
@@ -6140,9 +7906,23 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_ecn_set_ce_proto;
case BPF_FUNC_tcp_gen_syncookie:
return &bpf_tcp_gen_syncookie_proto;
+ case BPF_FUNC_sk_assign:
+ return &bpf_sk_assign_proto;
+ case BPF_FUNC_skb_set_tstamp:
+ return &bpf_skb_set_tstamp_proto;
+#ifdef CONFIG_SYN_COOKIES
+ case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
+ return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
+ case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
+ return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
+ case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
+ return &bpf_tcp_raw_check_syncookie_ipv4_proto;
+ case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
+ return &bpf_tcp_raw_check_syncookie_ipv6_proto;
+#endif
#endif
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6166,8 +7946,16 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_redirect_map_proto;
case BPF_FUNC_xdp_adjust_tail:
return &bpf_xdp_adjust_tail_proto;
+ case BPF_FUNC_xdp_get_buff_len:
+ return &bpf_xdp_get_buff_len_proto;
+ case BPF_FUNC_xdp_load_bytes:
+ return &bpf_xdp_load_bytes_proto;
+ case BPF_FUNC_xdp_store_bytes:
+ return &bpf_xdp_store_bytes_proto;
case BPF_FUNC_fib_lookup:
return &bpf_xdp_fib_lookup_proto;
+ case BPF_FUNC_check_mtu:
+ return &bpf_xdp_check_mtu_proto;
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_udp:
return &bpf_xdp_sk_lookup_udp_proto;
@@ -6181,9 +7969,19 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_tcp_check_syncookie_proto;
case BPF_FUNC_tcp_gen_syncookie:
return &bpf_tcp_gen_syncookie_proto;
+#ifdef CONFIG_SYN_COOKIES
+ case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
+ return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
+ case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
+ return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
+ case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
+ return &bpf_tcp_raw_check_syncookie_ipv4_proto;
+ case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
+ return &bpf_tcp_raw_check_syncookie_ipv6_proto;
+#endif
#endif
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6193,11 +7991,17 @@ const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
static const struct bpf_func_proto *
sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
case BPF_FUNC_setsockopt:
- return &bpf_setsockopt_proto;
+ return &bpf_sock_ops_setsockopt_proto;
case BPF_FUNC_getsockopt:
- return &bpf_getsockopt_proto;
+ return &bpf_sock_ops_getsockopt_proto;
case BPF_FUNC_sock_ops_cb_flags_set:
return &bpf_sock_ops_cb_flags_set_proto;
case BPF_FUNC_sock_map_update:
@@ -6206,20 +8010,26 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sock_hash_update_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_ops_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
case BPF_FUNC_perf_event_output:
- return &bpf_sockopt_event_output_proto;
+ return &bpf_event_output_data_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_ops_proto;
#ifdef CONFIG_INET
+ case BPF_FUNC_load_hdr_opt:
+ return &bpf_sock_ops_load_hdr_opt_proto;
+ case BPF_FUNC_store_hdr_opt:
+ return &bpf_sock_ops_store_hdr_opt_proto;
+ case BPF_FUNC_reserve_hdr_opt:
+ return &bpf_sock_ops_reserve_hdr_opt_proto;
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif /* CONFIG_INET */
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6244,8 +8054,30 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_msg_push_data_proto;
case BPF_FUNC_msg_pop_data:
return &bpf_msg_pop_data_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sk_msg_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6266,6 +8098,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &sk_skb_change_tail_proto;
case BPF_FUNC_skb_change_head:
return &sk_skb_change_head_proto;
+ case BPF_FUNC_skb_adjust_room:
+ return &sk_skb_adjust_room_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:
@@ -6287,7 +8121,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skc_lookup_tcp_proto;
#endif
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6298,7 +8132,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_skb_load_bytes:
return &bpf_flow_dissector_load_bytes_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6325,7 +8159,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_skb_under_cgroup:
return &bpf_skb_under_cgroup_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id);
}
}
@@ -6364,6 +8198,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_store_bytes_proto;
case BPF_FUNC_csum_update:
return &bpf_csum_update_proto;
+ case BPF_FUNC_csum_level:
+ return &bpf_csum_level_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
@@ -6424,6 +8260,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
break;
case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
return false;
+ case bpf_ctx_range(struct __sk_buff, hwtstamp):
+ if (type == BPF_WRITE || size != sizeof(__u64))
+ return false;
+ break;
case bpf_ctx_range(struct __sk_buff, tstamp):
if (size != sizeof(__u64))
return false;
@@ -6433,6 +8273,11 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
return false;
info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
break;
+ case offsetof(struct __sk_buff, tstamp_type):
+ return false;
+ case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1:
+ /* Explicitly prohibit access to padding in __sk_buff. */
+ return false;
default:
/* Only narrow read access allowed for now. */
if (type == BPF_WRITE) {
@@ -6461,6 +8306,7 @@ static bool sk_filter_is_valid_access(int off, int size,
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
case bpf_ctx_range(struct __sk_buff, tstamp):
case bpf_ctx_range(struct __sk_buff, wire_len):
+ case bpf_ctx_range(struct __sk_buff, hwtstamp):
return false;
}
@@ -6488,7 +8334,7 @@ static bool cg_skb_is_valid_access(int off, int size,
return false;
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_end):
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return false;
break;
}
@@ -6500,7 +8346,7 @@ static bool cg_skb_is_valid_access(int off, int size,
case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
case bpf_ctx_range(struct __sk_buff, tstamp):
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return false;
break;
default:
@@ -6531,6 +8377,7 @@ static bool lwt_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range(struct __sk_buff, tstamp):
case bpf_ctx_range(struct __sk_buff, wire_len):
+ case bpf_ctx_range(struct __sk_buff, hwtstamp):
return false;
}
@@ -6568,6 +8415,7 @@ static bool __sock_filter_check_attach_type(int off,
case offsetof(struct bpf_sock, priority):
switch (attach_type) {
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET_SOCK_RELEASE:
goto full_access;
default:
return false;
@@ -6617,6 +8465,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
const int size_default = sizeof(__u32);
+ int field_size;
if (off < 0 || off >= sizeof(struct bpf_sock))
return false;
@@ -6628,14 +8477,22 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
case offsetof(struct bpf_sock, family):
case offsetof(struct bpf_sock, type):
case offsetof(struct bpf_sock, protocol):
- case offsetof(struct bpf_sock, dst_port):
case offsetof(struct bpf_sock, src_port):
+ case offsetof(struct bpf_sock, rx_queue_mapping):
case bpf_ctx_range(struct bpf_sock, src_ip4):
case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
case bpf_ctx_range(struct bpf_sock, dst_ip4):
case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
+ case bpf_ctx_range(struct bpf_sock, dst_port):
+ field_size = size == size_default ?
+ size_default : sizeof_field(struct bpf_sock, dst_port);
+ bpf_ctx_record_field_size(info, field_size);
+ return bpf_ctx_narrow_access_ok(off, size, field_size);
+ case offsetofend(struct bpf_sock, dst_port) ...
+ offsetof(struct bpf_sock, dst_ip4) - 1:
+ return false;
}
return size == size_default;
@@ -6675,7 +8532,7 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
* (Fast-path, otherwise approximation that we might be
* a clone, do the rest in helper.)
*/
- *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
+ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET);
*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
@@ -6706,8 +8563,6 @@ static int bpf_gen_ld_abs(const struct bpf_insn *orig,
bool indirect = BPF_MODE(orig->code) == BPF_IND;
struct bpf_insn *insn = insn_buf;
- /* We're guaranteed here that CTX is in R6. */
- *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
if (!indirect) {
*insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
} else {
@@ -6715,6 +8570,8 @@ static int bpf_gen_ld_abs(const struct bpf_insn *orig,
if (orig->imm)
*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
}
+ /* We're guaranteed here that CTX is in R6. */
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
switch (BPF_SIZE(orig->code)) {
case BPF_B:
@@ -6773,11 +8630,50 @@ static bool tc_cls_act_is_valid_access(int off, int size,
break;
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
+ case offsetof(struct __sk_buff, tstamp_type):
+ /* The convert_ctx_access() on reading and writing
+ * __sk_buff->tstamp depends on whether the bpf prog
+ * has used __sk_buff->tstamp_type or not.
+ * Thus, we need to set prog->tstamp_type_access
+ * earlier during is_valid_access() here.
+ */
+ ((struct bpf_prog *)prog)->tstamp_type_access = 1;
+ return size == sizeof(__u8);
}
return bpf_skb_is_valid_access(off, size, type, prog, info);
}
+DEFINE_MUTEX(nf_conn_btf_access_lock);
+EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);
+
+int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, int off, int size,
+ enum bpf_access_type atype, u32 *next_btf_id,
+ enum bpf_type_flag *flag);
+EXPORT_SYMBOL_GPL(nfct_btf_struct_access);
+
+static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
+ const struct btf *btf,
+ const struct btf_type *t, int off,
+ int size, enum bpf_access_type atype,
+ u32 *next_btf_id,
+ enum bpf_type_flag *flag)
+{
+ int ret = -EACCES;
+
+ if (atype == BPF_READ)
+ return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+ flag);
+
+ mutex_lock(&nf_conn_btf_access_lock);
+ if (nfct_btf_struct_access)
+ ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag);
+ mutex_unlock(&nf_conn_btf_access_lock);
+
+ return ret;
+}
+
static bool __is_valid_xdp_access(int off, int size)
{
if (off < 0 || off >= sizeof(struct xdp_md))
@@ -6795,6 +8691,13 @@ static bool xdp_is_valid_access(int off, int size,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP) {
+ switch (off) {
+ case offsetof(struct xdp_md, egress_ifindex):
+ return false;
+ }
+ }
+
if (type == BPF_WRITE) {
if (bpf_prog_is_dev_bound(prog->aux)) {
switch (off) {
@@ -6820,16 +8723,37 @@ static bool xdp_is_valid_access(int off, int size,
return __is_valid_xdp_access(off, size);
}
-void bpf_warn_invalid_xdp_action(u32 act)
+void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act)
{
const u32 act_max = XDP_REDIRECT;
- WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n",
- act > act_max ? "Illegal" : "Driver unsupported",
- act);
+ pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n",
+ act > act_max ? "Illegal" : "Driver unsupported",
+ act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A");
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
+static int xdp_btf_struct_access(struct bpf_verifier_log *log,
+ const struct btf *btf,
+ const struct btf_type *t, int off,
+ int size, enum bpf_access_type atype,
+ u32 *next_btf_id,
+ enum bpf_type_flag *flag)
+{
+ int ret = -EACCES;
+
+ if (atype == BPF_READ)
+ return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+ flag);
+
+ mutex_lock(&nf_conn_btf_access_lock);
+ if (nfct_btf_struct_access)
+ ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag);
+ mutex_unlock(&nf_conn_btf_access_lock);
+
+ return ret;
+}
+
static bool sock_addr_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -6850,6 +8774,8 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
break;
@@ -6861,6 +8787,8 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP6_RECVMSG:
break;
@@ -6893,6 +8821,7 @@ static bool sock_addr_is_valid_access(int off, int size,
case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
msg_src_ip6[3]):
+ case bpf_ctx_range(struct bpf_sock_addr, user_port):
if (type == BPF_READ) {
bpf_ctx_record_field_size(info, size_default);
@@ -6923,10 +8852,6 @@ static bool sock_addr_is_valid_access(int off, int size,
return false;
}
break;
- case bpf_ctx_range(struct bpf_sock_addr, user_port):
- if (size != size_default)
- return false;
- break;
case offsetof(struct bpf_sock_addr, sk):
if (type != BPF_READ)
return false;
@@ -6982,6 +8907,20 @@ static bool sock_ops_is_valid_access(int off, int size,
return false;
info->reg_type = PTR_TO_SOCKET_OR_NULL;
break;
+ case offsetof(struct bpf_sock_ops, skb_data):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct bpf_sock_ops, skb_data_end):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case offsetof(struct bpf_sock_ops, skb_tcp_flags):
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size,
+ size_default);
default:
if (size != size_default)
return false;
@@ -7008,6 +8947,7 @@ static bool sk_skb_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range(struct __sk_buff, tstamp):
case bpf_ctx_range(struct __sk_buff, wire_len):
+ case bpf_ctx_range(struct __sk_buff, hwtstamp):
return false;
}
@@ -7057,6 +8997,11 @@ static bool sk_msg_is_valid_access(int off, int size,
if (size != sizeof(__u64))
return false;
break;
+ case offsetof(struct sk_msg_md, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
case bpf_ctx_range(struct sk_msg_md, family):
case bpf_ctx_range(struct sk_msg_md, remote_ip4):
case bpf_ctx_range(struct sk_msg_md, local_ip4):
@@ -7140,6 +9085,114 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ __u8 value_reg = si->dst_reg;
+ __u8 skb_reg = si->src_reg;
+ /* AX is needed because src_reg and dst_reg could be the same */
+ __u8 tmp_reg = BPF_REG_AX;
+
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
+ PKT_VLAN_PRESENT_OFFSET);
+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
+ SKB_MONO_DELIVERY_TIME_MASK, 2);
+ *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
+ *insn++ = BPF_JMP_A(1);
+ *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO);
+
+ return insn;
+}
+
+static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ /* si->dst_reg = skb_shinfo(SKB); */
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, end));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, head));
+ *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+#else
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, end));
+#endif
+
+ return insn;
+}
+
+static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ __u8 value_reg = si->dst_reg;
+ __u8 skb_reg = si->src_reg;
+
+#ifdef CONFIG_NET_CLS_ACT
+ /* If the tstamp_type is read,
+ * the bpf prog is aware the tstamp could have delivery time.
+ * Thus, read skb->tstamp as is if tstamp_type_access is true.
+ */
+ if (!prog->tstamp_type_access) {
+ /* AX is needed because src_reg and dst_reg could be the same */
+ __u8 tmp_reg = BPF_REG_AX;
+
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
+ TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK);
+ *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg,
+ TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2);
+ /* skb->tc_at_ingress && skb->mono_delivery_time,
+ * read 0 as the (rcv) timestamp.
+ */
+ *insn++ = BPF_MOV64_IMM(value_reg, 0);
+ *insn++ = BPF_JMP_A(1);
+ }
+#endif
+
+ *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
+ offsetof(struct sk_buff, tstamp));
+ return insn;
+}
+
+static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ __u8 value_reg = si->src_reg;
+ __u8 skb_reg = si->dst_reg;
+
+#ifdef CONFIG_NET_CLS_ACT
+ /* If the tstamp_type is read,
+ * the bpf prog is aware the tstamp could have delivery time.
+ * Thus, write skb->tstamp as is if tstamp_type_access is true.
+ * Otherwise, writing at ingress will have to clear the
+ * mono_delivery_time bit also.
+ */
+ if (!prog->tstamp_type_access) {
+ __u8 tmp_reg = BPF_REG_AX;
+
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET);
+ /* Writing __sk_buff->tstamp as ingress, goto <clear> */
+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
+ /* goto <store> */
+ *insn++ = BPF_JMP_A(2);
+ /* <clear>: mono_delivery_time */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK);
+ *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, PKT_VLAN_PRESENT_OFFSET);
+ }
+#endif
+
+ /* <store>: skb->tstamp = tstamp */
+ *insn++ = BPF_STX_MEM(BPF_DW, skb_reg, value_reg,
+ offsetof(struct sk_buff, tstamp));
+ return insn;
+}
+
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -7214,7 +9267,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, pkt_type):
*target_size = 1;
*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
- PKT_TYPE_OFFSET());
+ PKT_TYPE_OFFSET);
*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
@@ -7239,7 +9292,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, vlan_present):
*target_size = 1;
*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
- PKT_VLAN_PRESENT_OFFSET());
+ PKT_VLAN_PRESENT_OFFSET);
if (PKT_VLAN_PRESENT_BIT)
*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT);
if (PKT_VLAN_PRESENT_BIT < 7)
@@ -7448,40 +9501,31 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_DW,
- si->dst_reg, si->src_reg,
- bpf_target_off(struct sk_buff,
- tstamp, 8,
- target_size));
+ insn = bpf_convert_tstamp_write(prog, si, insn);
else
- *insn++ = BPF_LDX_MEM(BPF_DW,
- si->dst_reg, si->src_reg,
- bpf_target_off(struct sk_buff,
- tstamp, 8,
- target_size));
+ insn = bpf_convert_tstamp_read(prog, si, insn);
+ break;
+
+ case offsetof(struct __sk_buff, tstamp_type):
+ insn = bpf_convert_tstamp_type_read(si, insn);
break;
case offsetof(struct __sk_buff, gso_segs):
- /* si->dst_reg = skb_shinfo(SKB); */
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
- BPF_REG_AX, si->src_reg,
- offsetof(struct sk_buff, end));
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
- si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, head));
- *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
-#else
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
- si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, end));
-#endif
+ insn = bpf_convert_shinfo_access(si, insn);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
si->dst_reg, si->dst_reg,
bpf_target_off(struct skb_shared_info,
gso_segs, 2,
target_size));
break;
+ case offsetof(struct __sk_buff, gso_size):
+ insn = bpf_convert_shinfo_access(si, insn);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
+ si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ gso_size, 2,
+ target_size));
+ break;
case offsetof(struct __sk_buff, wire_len):
BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);
@@ -7498,6 +9542,17 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
si->dst_reg, si->src_reg,
offsetof(struct sk_buff, sk));
break;
+ case offsetof(struct __sk_buff, hwtstamp):
+ BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8);
+ BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0);
+
+ insn = bpf_convert_shinfo_access(si, insn);
+ *insn++ = BPF_LDX_MEM(BPF_DW,
+ si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ hwtstamps, 8,
+ target_size));
+ break;
}
return insn - insn_buf;
@@ -7656,6 +9711,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
skc_state),
target_size));
break;
+ case offsetof(struct bpf_sock, rx_queue_mapping):
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock, sk_rx_queue_mapping,
+ sizeof_field(struct sock,
+ sk_rx_queue_mapping),
+ target_size));
+ *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING,
+ 1);
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
+#else
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
+ *target_size = 2;
+#endif
+ break;
}
return insn - insn_buf;
@@ -7726,6 +9798,16 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
offsetof(struct xdp_rxq_info,
queue_index));
break;
+ case offsetof(struct xdp_md, egress_ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, txq));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct xdp_txq_info, dev));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct net_device, ifindex));
+ break;
}
return insn - insn_buf;
@@ -7806,8 +9888,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
struct bpf_insn *insn_buf,
struct bpf_prog *prog, u32 *target_size)
{
+ int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port);
struct bpf_insn *insn = insn_buf;
- int off;
switch (si->off) {
case offsetof(struct bpf_sock_addr, user_family):
@@ -7842,9 +9924,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
offsetof(struct sockaddr_in6, sin6_port));
BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
sizeof_field(struct sockaddr_in6, sin6_port));
- SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern,
- struct sockaddr_in6, uaddr,
- sin6_port, tmp_reg);
+ /* Account for sin6_port being smaller than user_port. */
+ port_size = min(port_size, BPF_LDST_BYTES(si));
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
+ sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg);
break;
case offsetof(struct bpf_sock_addr, family):
@@ -7900,15 +9984,31 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
/* Helper macro for adding read access to tcp_sock or sock fields. */
#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
do { \
+ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \
BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \
sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == si->src_reg) { \
+ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ fullsock_reg = reg; \
+ jmp += 2; \
+ } \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
is_fullsock), \
- si->dst_reg, si->src_reg, \
+ fullsock_reg, si->src_reg, \
offsetof(struct bpf_sock_ops_kern, \
is_fullsock)); \
- *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
+ if (si->dst_reg == si->src_reg) \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, sk),\
si->dst_reg, si->src_reg, \
@@ -7917,6 +10017,49 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
OBJ_FIELD), \
si->dst_reg, si->dst_reg, \
offsetof(OBJ, OBJ_FIELD)); \
+ if (si->dst_reg == si->src_reg) { \
+ *insn++ = BPF_JMP_A(1); \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ } \
+ } while (0)
+
+#define SOCK_OPS_GET_SK() \
+ do { \
+ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == si->src_reg) { \
+ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ fullsock_reg = reg; \
+ jmp += 2; \
+ } \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, \
+ is_fullsock), \
+ fullsock_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ is_fullsock)); \
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
+ if (si->dst_reg == si->src_reg) \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, sk),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, sk));\
+ if (si->dst_reg == si->src_reg) { \
+ *insn++ = BPF_JMP_A(1); \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ } \
} while (0)
#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
@@ -7974,17 +10117,22 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
switch (si->off) {
- case offsetof(struct bpf_sock_ops, op) ...
+ case offsetof(struct bpf_sock_ops, op):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ op),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, op));
+ break;
+
+ case offsetof(struct bpf_sock_ops, replylong[0]) ...
offsetof(struct bpf_sock_ops, replylong[3]):
- BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, op) !=
- sizeof_field(struct bpf_sock_ops_kern, op));
BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
sizeof_field(struct bpf_sock_ops_kern, reply));
BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
sizeof_field(struct bpf_sock_ops_kern, replylong));
off = si->off;
- off -= offsetof(struct bpf_sock_ops, op);
- off += offsetof(struct bpf_sock_ops_kern, op);
+ off -= offsetof(struct bpf_sock_ops, replylong[0]);
+ off += offsetof(struct bpf_sock_ops_kern, replylong[0]);
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
off);
@@ -8203,22 +10351,103 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
break;
case offsetof(struct bpf_sock_ops, sk):
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct bpf_sock_ops_kern,
- is_fullsock),
+ SOCK_OPS_GET_SK();
+ break;
+ case offsetof(struct bpf_sock_ops, skb_data_end):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb_data_end),
si->dst_reg, si->src_reg,
offsetof(struct bpf_sock_ops_kern,
- is_fullsock));
+ skb_data_end));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct bpf_sock_ops_kern, sk),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sk_buff, data));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_len):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
si->dst_reg, si->src_reg,
- offsetof(struct bpf_sock_ops_kern, sk));
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sk_buff, len));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_tcp_flags):
+ off = offsetof(struct sk_buff, cb);
+ off += offsetof(struct tcp_skb_cb, tcp_flags);
+ *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb,
+ tcp_flags),
+ si->dst_reg, si->dst_reg, off);
break;
}
return insn - insn_buf;
}
+/* data_end = skb->data + skb_headlen() */
+static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ int reg;
+ int temp_reg_off = offsetof(struct sk_buff, cb) +
+ offsetof(struct sk_skb_cb, temp_reg);
+
+ if (si->src_reg == si->dst_reg) {
+ /* We need an extra register, choose and save a register. */
+ reg = BPF_REG_9;
+ if (si->src_reg == reg || si->dst_reg == reg)
+ reg--;
+ if (si->src_reg == reg || si->dst_reg == reg)
+ reg--;
+ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off);
+ } else {
+ reg = si->dst_reg;
+ }
+
+ /* reg = skb->data */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ reg, si->src_reg,
+ offsetof(struct sk_buff, data));
+ /* AX = skb->len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, len));
+ /* reg = skb->data + skb->len */
+ *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX);
+ /* AX = skb->data_len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, data_len));
+
+ /* reg = skb->data + skb->len - skb->data_len */
+ *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX);
+
+ if (si->src_reg == si->dst_reg) {
+ /* Restore the saved register */
+ *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg);
+ *insn++ = BPF_MOV64_REG(si->dst_reg, reg);
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off);
+ }
+
+ return insn;
+}
+
static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -8229,13 +10458,29 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
switch (si->off) {
case offsetof(struct __sk_buff, data_end):
+ insn = bpf_convert_data_end_access(si, insn);
+ break;
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetofend(struct __sk_buff, cb[4]) - 1:
+ BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20);
+ BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
+ offsetof(struct sk_skb_cb, data)) %
+ sizeof(__u64));
+
+ prog->cb_access = 1;
off = si->off;
- off -= offsetof(struct __sk_buff, data_end);
+ off -= offsetof(struct __sk_buff, cb[0]);
off += offsetof(struct sk_buff, cb);
- off += offsetof(struct tcp_skb_cb, bpf.data_end);
- *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
- si->src_reg, off);
+ off += offsetof(struct sk_skb_cb, data);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, off);
break;
+
+
default:
return bpf_convert_ctx_access(type, si, insn_buf, prog,
target_size);
@@ -8375,6 +10620,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
si->dst_reg, si->src_reg,
offsetof(struct sk_msg_sg, size));
break;
+
+ case offsetof(struct sk_msg_md, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg, sk));
+ break;
}
return insn - insn_buf;
@@ -8397,6 +10648,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
.convert_ctx_access = tc_cls_act_convert_ctx_access,
.gen_prologue = tc_cls_act_prologue,
.gen_ld_abs = bpf_gen_ld_abs,
+ .btf_struct_access = tc_cls_act_btf_struct_access,
};
const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -8408,6 +10660,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = {
.is_valid_access = xdp_is_valid_access,
.convert_ctx_access = xdp_convert_ctx_access,
.gen_prologue = bpf_noop_prologue,
+ .btf_struct_access = xdp_btf_struct_access,
};
const struct bpf_prog_ops xdp_prog_ops = {
@@ -8542,14 +10795,13 @@ int sk_detach_filter(struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_detach_filter);
-int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
- unsigned int len)
+int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len)
{
struct sock_fprog_kern *fprog;
struct sk_filter *filter;
int ret = 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
filter = rcu_dereference_protected(sk->sk_filter,
lockdep_sock_is_held(sk));
if (!filter)
@@ -8574,7 +10826,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
goto out;
ret = -EFAULT;
- if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
+ if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog)))
goto out;
/* Instead of bytes, the API requests to return the number
@@ -8582,7 +10834,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
*/
ret = fprog->len;
out:
- release_sock(sk);
+ sockopt_release_sock(sk);
return ret;
}
@@ -8590,11 +10842,13 @@ out:
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
struct sock_reuseport *reuse,
struct sock *sk, struct sk_buff *skb,
+ struct sock *migrating_sk,
u32 hash)
{
reuse_kern->skb = skb;
reuse_kern->sk = sk;
reuse_kern->selected_sk = NULL;
+ reuse_kern->migrating_sk = migrating_sk;
reuse_kern->data_end = skb->data + skb_headlen(skb);
reuse_kern->hash = hash;
reuse_kern->reuseport_id = reuse->reuseport_id;
@@ -8603,13 +10857,14 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
struct bpf_prog *prog, struct sk_buff *skb,
+ struct sock *migrating_sk,
u32 hash)
{
struct sk_reuseport_kern reuse_kern;
enum sk_action action;
- bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
- action = BPF_PROG_RUN(prog, &reuse_kern);
+ bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
+ action = bpf_prog_run(prog, &reuse_kern);
if (action == SK_PASS)
return reuse_kern.selected_sk;
@@ -8620,6 +10875,7 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
struct bpf_map *, map, void *, key, u32, flags)
{
+ bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY;
struct sock_reuseport *reuse;
struct sock *selected_sk;
@@ -8628,26 +10884,24 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
return -ENOENT;
reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
- if (!reuse)
- /* selected_sk is unhashed (e.g. by close()) after the
- * above map_lookup_elem(). Treat selected_sk has already
- * been removed from the map.
+ if (!reuse) {
+ /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
+ if (sk_is_refcounted(selected_sk))
+ sock_put(selected_sk);
+
+ /* reuseport_array has only sk with non NULL sk_reuseport_cb.
+ * The only (!reuse) case here is - the sk has already been
+ * unhashed (e.g. by close()), so treat it as -ENOENT.
+ *
+ * Other maps (e.g. sock_map) do not provide this guarantee and
+ * the sk may never be in the reuseport group to begin with.
*/
- return -ENOENT;
+ return is_sockarray ? -ENOENT : -EINVAL;
+ }
if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
- struct sock *sk;
-
- if (unlikely(!reuse_kern->reuseport_id))
- /* There is a small race between adding the
- * sk to the map and setting the
- * reuse_kern->reuseport_id.
- * Treat it as the sk has not been added to
- * the bpf map yet.
- */
- return -ENOENT;
+ struct sock *sk = reuse_kern->sk;
- sk = reuse_kern->sk;
if (sk->sk_protocol != selected_sk->sk_protocol)
return -EPROTOTYPE;
else if (sk->sk_family != selected_sk->sk_family)
@@ -8719,6 +10973,10 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
return &sk_reuseport_load_bytes_proto;
case BPF_FUNC_skb_load_bytes_relative:
return &sk_reuseport_load_bytes_relative_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_ptr_cookie_proto;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -8748,11 +11006,19 @@ sk_reuseport_is_valid_access(int off, int size,
case offsetof(struct sk_reuseport_md, hash):
return size == size_default;
+ case offsetof(struct sk_reuseport_md, sk):
+ info->reg_type = PTR_TO_SOCKET;
+ return size == sizeof(__u64);
+
+ case offsetof(struct sk_reuseport_md, migrating_sk):
+ info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
+ return size == sizeof(__u64);
+
/* Fields that allow narrowing */
case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
if (size < sizeof_field(struct sk_buff, protocol))
return false;
- /* fall through */
+ fallthrough;
case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
case bpf_ctx_range(struct sk_reuseport_md, len):
@@ -8820,6 +11086,14 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct sk_reuseport_md, bind_inany):
SK_REUSEPORT_LOAD_FIELD(bind_inany);
break;
+
+ case offsetof(struct sk_reuseport_md, sk):
+ SK_REUSEPORT_LOAD_FIELD(sk);
+ break;
+
+ case offsetof(struct sk_reuseport_md, migrating_sk):
+ SK_REUSEPORT_LOAD_FIELD(migrating_sk);
+ break;
}
return insn - insn_buf;
@@ -8833,12 +11107,427 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
const struct bpf_prog_ops sk_reuseport_prog_ops = {
};
+
+DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled);
+EXPORT_SYMBOL(bpf_sk_lookup_enabled);
+
+BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
+ struct sock *, sk, u64, flags)
+{
+ if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE |
+ BPF_SK_LOOKUP_F_NO_REUSEPORT)))
+ return -EINVAL;
+ if (unlikely(sk && sk_is_refcounted(sk)))
+ return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */
+ if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN))
+ return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */
+ if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE))
+ return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */
+
+ /* Check if socket is suitable for packet L3/L4 protocol */
+ if (sk && sk->sk_protocol != ctx->protocol)
+ return -EPROTOTYPE;
+ if (sk && sk->sk_family != ctx->family &&
+ (sk->sk_family == AF_INET || ipv6_only_sock(sk)))
+ return -EAFNOSUPPORT;
+
+ if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE))
+ return -EEXIST;
+
+ /* Select socket as lookup result */
+ ctx->selected_sk = sk;
+ ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_assign_proto = {
+ .func = bpf_sk_lookup_assign,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_sk_assign:
+ return &bpf_sk_lookup_assign_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ default:
+ return bpf_sk_base_func_proto(func_id);
+ }
+}
+
+static bool sk_lookup_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= sizeof(struct bpf_sk_lookup))
+ return false;
+ if (off % size != 0)
+ return false;
+ if (type != BPF_READ)
+ return false;
+
+ switch (off) {
+ case offsetof(struct bpf_sk_lookup, sk):
+ info->reg_type = PTR_TO_SOCKET_OR_NULL;
+ return size == sizeof(__u64);
+
+ case bpf_ctx_range(struct bpf_sk_lookup, family):
+ case bpf_ctx_range(struct bpf_sk_lookup, protocol):
+ case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4):
+ case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
+ case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
+ case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
+ case bpf_ctx_range(struct bpf_sk_lookup, local_port):
+ case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
+ bpf_ctx_record_field_size(info, sizeof(__u32));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
+
+ case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
+ /* Allow 4-byte access to 2-byte field for backward compatibility */
+ if (size == sizeof(__u32))
+ return true;
+ bpf_ctx_record_field_size(info, sizeof(__be16));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16));
+
+ case offsetofend(struct bpf_sk_lookup, remote_port) ...
+ offsetof(struct bpf_sk_lookup, local_ip4) - 1:
+ /* Allow access to zero padding for backward compatibility */
+ bpf_ctx_record_field_size(info, sizeof(__u16));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16));
+
+ default:
+ return false;
+ }
+}
+
+static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sk_lookup, sk):
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sk_lookup_kern, selected_sk));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, family):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ family, 2, target_size));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, protocol):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ protocol, 2, target_size));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, remote_ip4):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ v4.saddr, 4, target_size));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, local_ip4):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ v4.daddr, 4, target_size));
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sk_lookup,
+ remote_ip6[0], remote_ip6[3]): {
+#if IS_ENABLED(CONFIG_IPV6)
+ int off = si->off;
+
+ off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
+ off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sk_lookup_kern, v6.saddr));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+ }
+ case bpf_ctx_range_till(struct bpf_sk_lookup,
+ local_ip6[0], local_ip6[3]): {
+#if IS_ENABLED(CONFIG_IPV6)
+ int off = si->off;
+
+ off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
+ off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sk_lookup_kern, v6.daddr));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+ }
+ case offsetof(struct bpf_sk_lookup, remote_port):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ sport, 2, target_size));
+ break;
+
+ case offsetofend(struct bpf_sk_lookup, remote_port):
+ *target_size = 2;
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+ break;
+
+ case offsetof(struct bpf_sk_lookup, local_port):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ dport, 2, target_size));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, ingress_ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ ingress_ifindex, 4, target_size));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+const struct bpf_prog_ops sk_lookup_prog_ops = {
+ .test_run = bpf_prog_test_run_sk_lookup,
+};
+
+const struct bpf_verifier_ops sk_lookup_verifier_ops = {
+ .get_func_proto = sk_lookup_func_proto,
+ .is_valid_access = sk_lookup_is_valid_access,
+ .convert_ctx_access = sk_lookup_convert_ctx_access,
+};
+
#endif /* CONFIG_INET */
-DEFINE_BPF_DISPATCHER(bpf_dispatcher_xdp)
+DEFINE_BPF_DISPATCHER(xdp)
void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
{
- bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(bpf_dispatcher_xdp),
- prev_prog, prog);
+ bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
+}
+
+BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE)
+#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
+BTF_SOCK_TYPE_xxx
+#undef BTF_SOCK_TYPE
+
+BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
+{
+ /* tcp6_sock type is not generated in dwarf and hence btf,
+ * trigger an explicit type generation here.
+ */
+ BTF_TYPE_EMIT(struct tcp6_sock);
+ if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_family == AF_INET6)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
+ .func = bpf_skc_to_tcp6_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
+};
+
+BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
+{
+ if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = {
+ .func = bpf_skc_to_tcp_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+};
+
+BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk)
+{
+ /* BTF types for tcp_timewait_sock and inet_timewait_sock are not
+ * generated if CONFIG_INET=n. Trigger an explicit generation here.
+ */
+ BTF_TYPE_EMIT(struct inet_timewait_sock);
+ BTF_TYPE_EMIT(struct tcp_timewait_sock);
+
+#ifdef CONFIG_INET
+ if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT)
+ return (unsigned long)sk;
+#endif
+
+#if IS_BUILTIN(CONFIG_IPV6)
+ if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT)
+ return (unsigned long)sk;
+#endif
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = {
+ .func = bpf_skc_to_tcp_timewait_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW],
+};
+
+BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk)
+{
+#ifdef CONFIG_INET
+ if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV)
+ return (unsigned long)sk;
+#endif
+
+#if IS_BUILTIN(CONFIG_IPV6)
+ if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV)
+ return (unsigned long)sk;
+#endif
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
+ .func = bpf_skc_to_tcp_request_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
+};
+
+BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk)
+{
+ /* udp6_sock type is not generated in dwarf and hence btf,
+ * trigger an explicit type generation here.
+ */
+ BTF_TYPE_EMIT(struct udp6_sock);
+ if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP &&
+ sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = {
+ .func = bpf_skc_to_udp6_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6],
+};
+
+BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk)
+{
+ /* unix_sock type is not generated in dwarf and hence btf,
+ * trigger an explicit type generation here.
+ */
+ BTF_TYPE_EMIT(struct unix_sock);
+ if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_unix_sock_proto = {
+ .func = bpf_skc_to_unix_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX],
+};
+
+BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk)
+{
+ BTF_TYPE_EMIT(struct mptcp_sock);
+ return (unsigned long)bpf_mptcp_sock_from_subflow(sk);
+}
+
+const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = {
+ .func = bpf_skc_to_mptcp_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP],
+};
+
+BPF_CALL_1(bpf_sock_from_file, struct file *, file)
+{
+ return (unsigned long)sock_from_file(file);
+}
+
+BTF_ID_LIST(bpf_sock_from_file_btf_ids)
+BTF_ID(struct, socket)
+BTF_ID(struct, file)
+
+const struct bpf_func_proto bpf_sock_from_file_proto = {
+ .func = bpf_sock_from_file,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .ret_btf_id = &bpf_sock_from_file_btf_ids[0],
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_sock_from_file_btf_ids[1],
+};
+
+static const struct bpf_func_proto *
+bpf_sk_base_func_proto(enum bpf_func_id func_id)
+{
+ const struct bpf_func_proto *func;
+
+ switch (func_id) {
+ case BPF_FUNC_skc_to_tcp6_sock:
+ func = &bpf_skc_to_tcp6_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_tcp_sock:
+ func = &bpf_skc_to_tcp_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_tcp_timewait_sock:
+ func = &bpf_skc_to_tcp_timewait_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_tcp_request_sock:
+ func = &bpf_skc_to_tcp_request_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_udp6_sock:
+ func = &bpf_skc_to_udp6_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_unix_sock:
+ func = &bpf_skc_to_unix_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_mptcp_sock:
+ func = &bpf_skc_to_mptcp_sock_proto;
+ break;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+
+ if (!perfmon_capable())
+ return NULL;
+
+ return func;
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index a1670dff0629..25cd35f5922e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -5,6 +5,7 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
+#include <linux/filter.h>
#include <net/dsa.h>
#include <net/dst_metadata.h>
#include <net/ip.h>
@@ -21,8 +22,10 @@
#include <linux/ppp_defs.h>
#include <linux/stddef.h>
#include <linux/if_ether.h>
+#include <linux/if_hsr.h>
#include <linux/mpls.h>
#include <linux/tcp.h>
+#include <linux/ptp_classify.h>
#include <net/flow_dissector.h>
#include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h>
@@ -31,8 +34,7 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_labels.h>
#endif
-
-static DEFINE_MUTEX(flow_dissector_mutex);
+#include <linux/bpf-netns.h>
static void dissector_set_key(struct flow_dissector *flow_dissector,
enum flow_dissector_key_id key_id)
@@ -49,7 +51,7 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
memset(flow_dissector, 0, sizeof(*flow_dissector));
for (i = 0; i < key_count; i++, key++) {
- /* User should make sure that every key target offset is withing
+ /* User should make sure that every key target offset is within
* boundaries of unsigned short.
*/
BUG_ON(key->offset > USHRT_MAX);
@@ -70,54 +72,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
}
EXPORT_SYMBOL(skb_flow_dissector_init);
-int skb_flow_dissector_prog_query(const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+#ifdef CONFIG_BPF_SYSCALL
+int flow_dissector_bpf_prog_attach_check(struct net *net,
+ struct bpf_prog *prog)
{
- __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
- u32 prog_id, prog_cnt = 0, flags = 0;
- struct bpf_prog *attached;
- struct net *net;
-
- if (attr->query.query_flags)
- return -EINVAL;
-
- net = get_net_ns_by_fd(attr->query.target_fd);
- if (IS_ERR(net))
- return PTR_ERR(net);
-
- rcu_read_lock();
- attached = rcu_dereference(net->flow_dissector_prog);
- if (attached) {
- prog_cnt = 1;
- prog_id = attached->aux->id;
- }
- rcu_read_unlock();
-
- put_net(net);
-
- if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
- return -EFAULT;
- if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
- return -EFAULT;
-
- if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
- return 0;
-
- if (copy_to_user(prog_ids, &prog_id, sizeof(u32)))
- return -EFAULT;
-
- return 0;
-}
-
-int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
- struct bpf_prog *prog)
-{
- struct bpf_prog *attached;
- struct net *net;
- int ret = 0;
-
- net = current->nsproxy->net_ns;
- mutex_lock(&flow_dissector_mutex);
+ enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
if (net == &init_net) {
/* BPF flow dissector in the root namespace overrides
@@ -130,54 +89,20 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
for_each_net(ns) {
if (ns == &init_net)
continue;
- if (rcu_access_pointer(ns->flow_dissector_prog)) {
- ret = -EEXIST;
- goto out;
- }
+ if (rcu_access_pointer(ns->bpf.run_array[type]))
+ return -EEXIST;
}
} else {
/* Make sure root flow dissector is not attached
* when attaching to the non-root namespace.
*/
- if (rcu_access_pointer(init_net.flow_dissector_prog)) {
- ret = -EEXIST;
- goto out;
- }
- }
-
- attached = rcu_dereference_protected(net->flow_dissector_prog,
- lockdep_is_held(&flow_dissector_mutex));
- if (attached == prog) {
- /* The same program cannot be attached twice */
- ret = -EINVAL;
- goto out;
+ if (rcu_access_pointer(init_net.bpf.run_array[type]))
+ return -EEXIST;
}
- rcu_assign_pointer(net->flow_dissector_prog, prog);
- if (attached)
- bpf_prog_put(attached);
-out:
- mutex_unlock(&flow_dissector_mutex);
- return ret;
-}
-int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
-{
- struct bpf_prog *attached;
- struct net *net;
-
- net = current->nsproxy->net_ns;
- mutex_lock(&flow_dissector_mutex);
- attached = rcu_dereference_protected(net->flow_dissector_prog,
- lockdep_is_held(&flow_dissector_mutex));
- if (!attached) {
- mutex_unlock(&flow_dissector_mutex);
- return -ENOENT;
- }
- RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
- bpf_prog_put(attached);
- mutex_unlock(&flow_dissector_mutex);
return 0;
}
+#endif /* CONFIG_BPF_SYSCALL */
/**
* __skb_flow_get_ports - extract the upper layer ports and return them
@@ -191,7 +116,7 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
* is the protocol port offset returned from proto_ports_offset
*/
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
- void *data, int hlen)
+ const void *data, int hlen)
{
int poff = proto_ports_offset(ip_proto);
@@ -238,7 +163,7 @@ static bool icmp_has_id(u8 type)
*/
void skb_flow_get_icmp_tci(const struct sk_buff *skb,
struct flow_dissector_key_icmp *key_icmp,
- void *data, int thoff, int hlen)
+ const void *data, int thoff, int hlen)
{
struct icmphdr *ih, _ih;
@@ -253,7 +178,7 @@ void skb_flow_get_icmp_tci(const struct sk_buff *skb,
* avoid confusion with packets without such field
*/
if (icmp_has_id(ih->type))
- key_icmp->id = ih->un.echo.id ? : 1;
+ key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1;
else
key_icmp->id = 0;
}
@@ -264,8 +189,8 @@ EXPORT_SYMBOL(skb_flow_get_icmp_tci);
*/
static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container,
- void *data, int thoff, int hlen)
+ void *target_container, const void *data,
+ int thoff, int hlen)
{
struct flow_dissector_key_icmp *key_icmp;
@@ -279,6 +204,30 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen);
}
+static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_l2tpv3 *key_l2tpv3;
+ struct {
+ __be32 session_id;
+ } *hdr, _hdr;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3))
+ return;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr)
+ return;
+
+ key_l2tpv3 = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_L2TPV3,
+ target_container);
+
+ key_l2tpv3->session_id = hdr->session_id;
+}
+
void skb_flow_dissect_meta(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container)
@@ -314,9 +263,8 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
void
skb_flow_dissect_ct(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container,
- u16 *ctinfo_map,
- size_t mapsize)
+ void *target_container, u16 *ctinfo_map,
+ size_t mapsize, bool post_ct, u16 zone)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
struct flow_dissector_key_ct *key;
@@ -328,13 +276,20 @@ skb_flow_dissect_ct(const struct sk_buff *skb,
return;
ct = nf_ct_get(skb, &ctinfo);
- if (!ct)
+ if (!ct && !post_ct)
return;
key = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_CT,
target_container);
+ if (!ct) {
+ key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_INVALID;
+ key->ct_zone = zone;
+ return;
+ }
+
if (ctinfo < mapsize)
key->ct_state = ctinfo_map[ctinfo];
#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)
@@ -461,56 +416,86 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
}
EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
+void skb_flow_dissect_hash(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_hash *key;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH))
+ return;
+
+ key = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_HASH,
+ target_container);
+
+ key->hash = skb_get_hash_raw(skb);
+}
+EXPORT_SYMBOL(skb_flow_dissect_hash);
+
static enum flow_dissect_ret
__skb_flow_dissect_mpls(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff, int hlen)
+ void *target_container, const void *data, int nhoff,
+ int hlen, int lse_index, bool *entropy_label)
{
- struct flow_dissector_key_keyid *key_keyid;
- struct mpls_label *hdr, _hdr[2];
- u32 entry, label;
+ struct mpls_label *hdr, _hdr;
+ u32 entry, label, bos;
if (!dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_MPLS_ENTROPY) &&
!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS))
return FLOW_DISSECT_RET_OUT_GOOD;
+ if (lse_index >= FLOW_DIS_MPLS_MAX)
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
hlen, &_hdr);
if (!hdr)
return FLOW_DISSECT_RET_OUT_BAD;
- entry = ntohl(hdr[0].entry);
+ entry = ntohl(hdr->entry);
label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT;
+ bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT;
if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) {
struct flow_dissector_key_mpls *key_mpls;
+ struct flow_dissector_mpls_lse *lse;
key_mpls = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_MPLS,
target_container);
- key_mpls->mpls_label = label;
- key_mpls->mpls_ttl = (entry & MPLS_LS_TTL_MASK)
- >> MPLS_LS_TTL_SHIFT;
- key_mpls->mpls_tc = (entry & MPLS_LS_TC_MASK)
- >> MPLS_LS_TC_SHIFT;
- key_mpls->mpls_bos = (entry & MPLS_LS_S_MASK)
- >> MPLS_LS_S_SHIFT;
+ lse = &key_mpls->ls[lse_index];
+
+ lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
+ lse->mpls_bos = bos;
+ lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT;
+ lse->mpls_label = label;
+ dissector_set_mpls_lse(key_mpls, lse_index);
}
- if (label == MPLS_LABEL_ENTROPY) {
+ if (*entropy_label &&
+ dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
+ struct flow_dissector_key_keyid *key_keyid;
+
key_keyid = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
target_container);
- key_keyid->keyid = hdr[1].entry & htonl(MPLS_LS_LABEL_MASK);
+ key_keyid->keyid = cpu_to_be32(label);
}
- return FLOW_DISSECT_RET_OUT_GOOD;
+
+ *entropy_label = label == MPLS_LABEL_ENTROPY;
+
+ return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN;
}
static enum flow_dissect_ret
__skb_flow_dissect_arp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff, int hlen)
+ void *target_container, const void *data,
+ int nhoff, int hlen)
{
struct flow_dissector_key_arp *key_arp;
struct {
@@ -566,7 +551,7 @@ static enum flow_dissect_ret
__skb_flow_dissect_gre(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control,
struct flow_dissector *flow_dissector,
- void *target_container, void *data,
+ void *target_container, const void *data,
__be16 *p_proto, int *p_nhoff, int *p_hlen,
unsigned int flags)
{
@@ -706,8 +691,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
static enum flow_dissect_ret
__skb_flow_dissect_batadv(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control,
- void *data, __be16 *p_proto, int *p_nhoff, int hlen,
- unsigned int flags)
+ const void *data, __be16 *p_proto, int *p_nhoff,
+ int hlen, unsigned int flags)
{
struct {
struct batadv_unicast_packet batadv_unicast;
@@ -738,7 +723,8 @@ __skb_flow_dissect_batadv(const struct sk_buff *skb,
static void
__skb_flow_dissect_tcp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int thoff, int hlen)
+ void *target_container, const void *data,
+ int thoff, int hlen)
{
struct flow_dissector_key_tcp *key_tcp;
struct tcphdr *th, _th;
@@ -762,8 +748,8 @@ __skb_flow_dissect_tcp(const struct sk_buff *skb,
static void
__skb_flow_dissect_ports(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff,
- u8 ip_proto, int hlen)
+ void *target_container, const void *data,
+ int nhoff, u8 ip_proto, int hlen)
{
enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX;
struct flow_dissector_key_ports *key_ports;
@@ -787,7 +773,8 @@ __skb_flow_dissect_ports(const struct sk_buff *skb,
static void
__skb_flow_dissect_ipv4(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, const struct iphdr *iph)
+ void *target_container, const void *data,
+ const struct iphdr *iph)
{
struct flow_dissector_key_ip *key_ip;
@@ -804,7 +791,8 @@ __skb_flow_dissect_ipv4(const struct sk_buff *skb,
static void
__skb_flow_dissect_ipv6(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, const struct ipv6hdr *iph)
+ void *target_container, const void *data,
+ const struct ipv6hdr *iph)
{
struct flow_dissector_key_ip *key_ip;
@@ -871,8 +859,10 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
key_addrs = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_IPV6_ADDRS,
target_container);
- memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src,
- sizeof(key_addrs->v6addrs));
+ memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src,
+ sizeof(key_addrs->v6addrs.src));
+ memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst,
+ sizeof(key_addrs->v6addrs.dst));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
@@ -900,8 +890,8 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
}
}
-bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
- __be16 proto, int nhoff, int hlen, unsigned int flags)
+u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
+ __be16 proto, int nhoff, int hlen, unsigned int flags)
{
struct bpf_flow_keys *flow_keys = ctx->flow_keys;
u32 result;
@@ -920,15 +910,18 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
(int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);
flow_keys->flags = flags;
- preempt_disable();
- result = BPF_PROG_RUN(prog, ctx);
- preempt_enable();
+ result = bpf_prog_run_pin_on_cpu(prog, ctx);
flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen);
flow_keys->thoff = clamp_t(u16, flow_keys->thoff,
flow_keys->nhoff, hlen);
- return result == BPF_OK;
+ return result;
+}
+
+static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr)
+{
+ return hdr->ver == 1 && hdr->type == 1 && hdr->code == 0;
}
/**
@@ -953,18 +946,18 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
bool __skb_flow_dissect(const struct net *net,
const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container,
- void *data, __be16 proto, int nhoff, int hlen,
- unsigned int flags)
+ void *target_container, const void *data,
+ __be16 proto, int nhoff, int hlen, unsigned int flags)
{
struct flow_dissector_key_control *key_control;
struct flow_dissector_key_basic *key_basic;
struct flow_dissector_key_addrs *key_addrs;
struct flow_dissector_key_tags *key_tags;
struct flow_dissector_key_vlan *key_vlan;
- struct bpf_prog *attached = NULL;
enum flow_dissect_ret fdret;
enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
+ bool mpls_el = false;
+ int mpls_lse = 0;
int num_hdrs = 0;
u8 ip_proto = 0;
bool ret;
@@ -982,8 +975,14 @@ bool __skb_flow_dissect(const struct net *net,
int offset = 0;
ops = skb->dev->dsa_ptr->tag_ops;
- if (ops->flow_dissect &&
- !ops->flow_dissect(skb, &proto, &offset)) {
+ /* Only DSA header taggers break flow dissection */
+ if (ops->needed_headroom) {
+ if (ops->flow_dissect)
+ ops->flow_dissect(skb, &proto, &offset);
+ else
+ dsa_tag_generic_flow_dissect(skb,
+ &proto,
+ &offset);
hlen -= offset;
nhoff += offset;
}
@@ -1016,13 +1015,15 @@ bool __skb_flow_dissect(const struct net *net,
WARN_ON_ONCE(!net);
if (net) {
- rcu_read_lock();
- attached = rcu_dereference(init_net.flow_dissector_prog);
+ enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
+ struct bpf_prog_array *run_array;
- if (!attached)
- attached = rcu_dereference(net->flow_dissector_prog);
+ rcu_read_lock();
+ run_array = rcu_dereference(init_net.bpf.run_array[type]);
+ if (!run_array)
+ run_array = rcu_dereference(net->bpf.run_array[type]);
- if (attached) {
+ if (run_array) {
struct bpf_flow_keys flow_keys;
struct bpf_flow_dissector ctx = {
.flow_keys = &flow_keys,
@@ -1030,6 +1031,8 @@ bool __skb_flow_dissect(const struct net *net,
.data_end = data + hlen,
};
__be16 n_proto = proto;
+ struct bpf_prog *prog;
+ u32 result;
if (skb) {
ctx.skb = skb;
@@ -1040,13 +1043,17 @@ bool __skb_flow_dissect(const struct net *net,
n_proto = skb->protocol;
}
- ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff,
- hlen, flags);
+ prog = READ_ONCE(run_array->items[0].prog);
+ result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
+ hlen, flags);
+ if (result == BPF_FLOW_DISSECTOR_CONTINUE)
+ goto dissect_continue;
__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
target_container);
rcu_read_unlock();
- return ret;
+ return result == BPF_OK;
}
+dissect_continue:
rcu_read_unlock();
}
@@ -1058,7 +1065,17 @@ bool __skb_flow_dissect(const struct net *net,
key_eth_addrs = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_ETH_ADDRS,
target_container);
- memcpy(key_eth_addrs, &eth->h_dest, sizeof(*key_eth_addrs));
+ memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs));
+ }
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) {
+ struct flow_dissector_key_num_of_vlans *key_num_of_vlans;
+
+ key_num_of_vlans = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_NUM_OF_VLANS,
+ target_container);
+ key_num_of_vlans->num_of_vlans = 0;
}
proto_again:
@@ -1085,11 +1102,16 @@ proto_again:
FLOW_DISSECTOR_KEY_IPV4_ADDRS,
target_container);
- memcpy(&key_addrs->v4addrs, &iph->saddr,
- sizeof(key_addrs->v4addrs));
+ memcpy(&key_addrs->v4addrs.src, &iph->saddr,
+ sizeof(key_addrs->v4addrs.src));
+ memcpy(&key_addrs->v4addrs.dst, &iph->daddr,
+ sizeof(key_addrs->v4addrs.dst));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
}
+ __skb_flow_dissect_ipv4(skb, flow_dissector,
+ target_container, data, iph);
+
if (ip_is_fragment(iph)) {
key_control->flags |= FLOW_DIS_IS_FRAGMENT;
@@ -1106,9 +1128,6 @@ proto_again:
}
}
- __skb_flow_dissect_ipv4(skb, flow_dissector,
- target_container, data, iph);
-
break;
}
case htons(ETH_P_IPV6): {
@@ -1130,8 +1149,10 @@ proto_again:
FLOW_DISSECTOR_KEY_IPV6_ADDRS,
target_container);
- memcpy(&key_addrs->v6addrs, &iph->saddr,
- sizeof(key_addrs->v6addrs));
+ memcpy(&key_addrs->v6addrs.src, &iph->saddr,
+ sizeof(key_addrs->v6addrs.src));
+ memcpy(&key_addrs->v6addrs.dst, &iph->daddr,
+ sizeof(key_addrs->v6addrs.dst));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
@@ -1180,6 +1201,16 @@ proto_again:
nhoff += sizeof(*vlan);
}
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) &&
+ !(key_control->flags & FLOW_DIS_ENCAPSULATION)) {
+ struct flow_dissector_key_num_of_vlans *key_nvs;
+
+ key_nvs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_NUM_OF_VLANS,
+ target_container);
+ key_nvs->num_of_vlans++;
+ }
+
if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) {
dissector_vlan = FLOW_DISSECTOR_KEY_VLAN;
} else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) {
@@ -1205,6 +1236,7 @@ proto_again:
VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
}
key_vlan->vlan_tpid = saved_vlan_tpid;
+ key_vlan->vlan_eth_type = proto;
}
fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
@@ -1215,27 +1247,60 @@ proto_again:
struct pppoe_hdr hdr;
__be16 proto;
} *hdr, _hdr;
+ u16 ppp_proto;
+
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr) {
fdret = FLOW_DISSECT_RET_OUT_BAD;
break;
}
- proto = hdr->proto;
- nhoff += PPPOE_SES_HLEN;
- switch (proto) {
- case htons(PPP_IP):
+ if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ /* least significant bit of the most significant octet
+ * indicates if protocol field was compressed
+ */
+ ppp_proto = ntohs(hdr->proto);
+ if (ppp_proto & 0x0100) {
+ ppp_proto = ppp_proto >> 8;
+ nhoff += PPPOE_SES_HLEN - 1;
+ } else {
+ nhoff += PPPOE_SES_HLEN;
+ }
+
+ if (ppp_proto == PPP_IP) {
proto = htons(ETH_P_IP);
fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
- break;
- case htons(PPP_IPV6):
+ } else if (ppp_proto == PPP_IPV6) {
proto = htons(ETH_P_IPV6);
fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
- break;
- default:
+ } else if (ppp_proto == PPP_MPLS_UC) {
+ proto = htons(ETH_P_MPLS_UC);
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ } else if (ppp_proto == PPP_MPLS_MC) {
+ proto = htons(ETH_P_MPLS_MC);
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ } else if (ppp_proto_is_valid(ppp_proto)) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ } else {
fdret = FLOW_DISSECT_RET_OUT_BAD;
break;
}
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_PPPOE)) {
+ struct flow_dissector_key_pppoe *key_pppoe;
+
+ key_pppoe = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PPPOE,
+ target_container);
+ key_pppoe->session_id = hdr->hdr.sid;
+ key_pppoe->ppp_proto = htons(ppp_proto);
+ key_pppoe->type = htons(ETH_P_PPP_SES);
+ }
break;
}
case htons(ETH_P_TIPC): {
@@ -1264,7 +1329,10 @@ proto_again:
case htons(ETH_P_MPLS_MC):
fdret = __skb_flow_dissect_mpls(skb, flow_dissector,
target_container, data,
- nhoff, hlen);
+ nhoff, hlen, mpls_lse,
+ &mpls_el);
+ nhoff += sizeof(struct mpls_label);
+ mpls_lse++;
break;
case htons(ETH_P_FCOE):
if ((hlen - nhoff) < FCOE_HEADER_LEN) {
@@ -1288,6 +1356,38 @@ proto_again:
&proto, &nhoff, hlen, flags);
break;
+ case htons(ETH_P_1588): {
+ struct ptp_header *hdr, _hdr;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
+ hlen, &_hdr);
+ if (!hdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ nhoff += ntohs(hdr->message_length);
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
+ case htons(ETH_P_PRP):
+ case htons(ETH_P_HSR): {
+ struct hsr_tag *hdr, _hdr;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen,
+ &_hdr);
+ if (!hdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ proto = hdr->encap_proto;
+ nhoff += HSR_HLEN;
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+ }
+
default:
fdret = FLOW_DISSECT_RET_OUT_BAD;
break;
@@ -1314,6 +1414,11 @@ ip_proto_again:
switch (ip_proto) {
case IPPROTO_GRE:
+ if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector,
target_container, data,
&proto, &nhoff, &hlen, flags);
@@ -1371,6 +1476,11 @@ ip_proto_again:
break;
}
case IPPROTO_IPIP:
+ if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
proto = htons(ETH_P_IP);
key_control->flags |= FLOW_DIS_ENCAPSULATION;
@@ -1383,6 +1493,11 @@ ip_proto_again:
break;
case IPPROTO_IPV6:
+ if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
proto = htons(ETH_P_IPV6);
key_control->flags |= FLOW_DIS_ENCAPSULATION;
@@ -1410,6 +1525,10 @@ ip_proto_again:
__skb_flow_dissect_icmp(skb, flow_dissector, target_container,
data, nhoff, hlen);
break;
+ case IPPROTO_L2TP:
+ __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
default:
break;
@@ -1453,7 +1572,7 @@ out_bad:
}
EXPORT_SYMBOL(__skb_flow_dissect);
-static siphash_key_t hashrnd __read_mostly;
+static siphash_aligned_key_t hashrnd;
static __always_inline void __flow_hash_secret_init(void)
{
net_get_random_once(&hashrnd, sizeof(hashrnd));
@@ -1515,7 +1634,7 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow)
}
EXPORT_SYMBOL(flow_get_u32_dst);
-/* Sort the source and destination IP (and the ports if the IP are the same),
+/* Sort the source and destination IP and the ports,
* to have consistent hash within the two directions
*/
static inline void __flow_hash_consistentify(struct flow_keys *keys)
@@ -1524,13 +1643,12 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys)
switch (keys->control.addr_type) {
case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
- addr_diff = (__force u32)keys->addrs.v4addrs.dst -
- (__force u32)keys->addrs.v4addrs.src;
- if ((addr_diff < 0) ||
- (addr_diff == 0 &&
- ((__force u16)keys->ports.dst <
- (__force u16)keys->ports.src))) {
+ if ((__force u32)keys->addrs.v4addrs.dst <
+ (__force u32)keys->addrs.v4addrs.src)
swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst);
+
+ if ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src) {
swap(keys->ports.src, keys->ports.dst);
}
break;
@@ -1538,13 +1656,13 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys)
addr_diff = memcmp(&keys->addrs.v6addrs.dst,
&keys->addrs.v6addrs.src,
sizeof(keys->addrs.v6addrs.dst));
- if ((addr_diff < 0) ||
- (addr_diff == 0 &&
- ((__force u16)keys->ports.dst <
- (__force u16)keys->ports.src))) {
+ if (addr_diff < 0) {
for (i = 0; i < 4; i++)
swap(keys->addrs.v6addrs.src.s6_addr32[i],
keys->addrs.v6addrs.dst.s6_addr32[i]);
+ }
+ if ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src) {
swap(keys->ports.src, keys->ports.dst);
}
break;
@@ -1658,7 +1776,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb,
}
EXPORT_SYMBOL(skb_get_hash_perturb);
-u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
const struct flow_keys_basic *keys, int hlen)
{
u32 poff = keys->control.thoff;
@@ -1840,5 +1958,4 @@ static int __init init_default_flow_dissectors(void)
ARRAY_SIZE(flow_keys_basic_dissector_keys));
return 0;
}
-
core_initcall(init_default_flow_dissectors);
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 45b6a59ac124..abe423fd5736 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -1,13 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <net/act_api.h>
#include <net/flow_offload.h>
#include <linux/rtnetlink.h>
#include <linux/mutex.h>
+#include <linux/rhashtable.h>
struct flow_rule *flow_rule_alloc(unsigned int num_actions)
{
struct flow_rule *rule;
+ int i;
rule = kzalloc(struct_size(rule, action.entries, num_actions),
GFP_KERNEL);
@@ -15,11 +18,36 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions)
return NULL;
rule->action.num_entries = num_actions;
+ /* Pre-fill each action hw_stats with DONT_CARE.
+ * Caller can override this if it wants stats for a given action.
+ */
+ for (i = 0; i < num_actions; i++)
+ rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE;
return rule;
}
EXPORT_SYMBOL(flow_rule_alloc);
+struct flow_offload_action *offload_action_alloc(unsigned int num_actions)
+{
+ struct flow_offload_action *fl_action;
+ int i;
+
+ fl_action = kzalloc(struct_size(fl_action, action.entries, num_actions),
+ GFP_KERNEL);
+ if (!fl_action)
+ return NULL;
+
+ fl_action->action.num_entries = num_actions;
+ /* Pre-fill each action hw_stats with DONT_CARE.
+ * Caller can override this if it wants stats for a given action.
+ */
+ for (i = 0; i < num_actions; i++)
+ fl_action->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE;
+
+ return fl_action;
+}
+
#define FLOW_DISSECTOR_MATCH(__rule, __type, __out) \
const struct flow_match *__m = &(__rule)->match; \
struct flow_dissector *__d = (__m)->dissector; \
@@ -97,6 +125,13 @@ void flow_rule_match_ports(const struct flow_rule *rule,
}
EXPORT_SYMBOL(flow_rule_match_ports);
+void flow_rule_match_ports_range(const struct flow_rule *rule,
+ struct flow_match_ports_range *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS_RANGE, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ports_range);
+
void flow_rule_match_tcp(const struct flow_rule *rule,
struct flow_match_tcp *out)
{
@@ -167,6 +202,48 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule,
}
EXPORT_SYMBOL(flow_rule_match_enc_opts);
+struct flow_action_cookie *flow_action_cookie_create(void *data,
+ unsigned int len,
+ gfp_t gfp)
+{
+ struct flow_action_cookie *cookie;
+
+ cookie = kmalloc(sizeof(*cookie) + len, gfp);
+ if (!cookie)
+ return NULL;
+ cookie->cookie_len = len;
+ memcpy(cookie->cookie, data, len);
+ return cookie;
+}
+EXPORT_SYMBOL(flow_action_cookie_create);
+
+void flow_action_cookie_destroy(struct flow_action_cookie *cookie)
+{
+ kfree(cookie);
+}
+EXPORT_SYMBOL(flow_action_cookie_destroy);
+
+void flow_rule_match_ct(const struct flow_rule *rule,
+ struct flow_match_ct *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CT, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ct);
+
+void flow_rule_match_pppoe(const struct flow_rule *rule,
+ struct flow_match_pppoe *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PPPOE, out);
+}
+EXPORT_SYMBOL(flow_rule_match_pppoe);
+
+void flow_rule_match_l2tpv3(const struct flow_rule *rule,
+ struct flow_match_l2tpv3 *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_L2TPV3, out);
+}
+EXPORT_SYMBOL(flow_rule_match_l2tpv3);
+
struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb,
void *cb_ident, void *cb_priv,
void (*release)(void *cb_priv))
@@ -283,240 +360,265 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
}
EXPORT_SYMBOL(flow_block_cb_setup_simple);
-static LIST_HEAD(block_cb_list);
-
-static struct rhashtable indr_setup_block_ht;
-
-struct flow_indr_block_cb {
- struct list_head list;
- void *cb_priv;
- flow_indr_block_bind_cb_t *cb;
- void *cb_ident;
-};
-
-struct flow_indr_block_dev {
- struct rhash_head ht_node;
- struct net_device *dev;
- unsigned int refcnt;
- struct list_head cb_list;
-};
+static DEFINE_MUTEX(flow_indr_block_lock);
+static LIST_HEAD(flow_block_indr_list);
+static LIST_HEAD(flow_block_indr_dev_list);
+static LIST_HEAD(flow_indir_dev_list);
-static const struct rhashtable_params flow_indr_setup_block_ht_params = {
- .key_offset = offsetof(struct flow_indr_block_dev, dev),
- .head_offset = offsetof(struct flow_indr_block_dev, ht_node),
- .key_len = sizeof(struct net_device *),
+struct flow_indr_dev {
+ struct list_head list;
+ flow_indr_block_bind_cb_t *cb;
+ void *cb_priv;
+ refcount_t refcnt;
};
-static struct flow_indr_block_dev *
-flow_indr_block_dev_lookup(struct net_device *dev)
-{
- return rhashtable_lookup_fast(&indr_setup_block_ht, &dev,
- flow_indr_setup_block_ht_params);
-}
-
-static struct flow_indr_block_dev *
-flow_indr_block_dev_get(struct net_device *dev)
+static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb,
+ void *cb_priv)
{
- struct flow_indr_block_dev *indr_dev;
+ struct flow_indr_dev *indr_dev;
- indr_dev = flow_indr_block_dev_lookup(dev);
- if (indr_dev)
- goto inc_ref;
-
- indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL);
+ indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL);
if (!indr_dev)
return NULL;
- INIT_LIST_HEAD(&indr_dev->cb_list);
- indr_dev->dev = dev;
- if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
- flow_indr_setup_block_ht_params)) {
- kfree(indr_dev);
- return NULL;
- }
+ indr_dev->cb = cb;
+ indr_dev->cb_priv = cb_priv;
+ refcount_set(&indr_dev->refcnt, 1);
-inc_ref:
- indr_dev->refcnt++;
return indr_dev;
}
-static void flow_indr_block_dev_put(struct flow_indr_block_dev *indr_dev)
-{
- if (--indr_dev->refcnt)
- return;
-
- rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node,
- flow_indr_setup_block_ht_params);
- kfree(indr_dev);
-}
+struct flow_indir_dev_info {
+ void *data;
+ struct net_device *dev;
+ struct Qdisc *sch;
+ enum tc_setup_type type;
+ void (*cleanup)(struct flow_block_cb *block_cb);
+ struct list_head list;
+ enum flow_block_command command;
+ enum flow_block_binder_type binder_type;
+ struct list_head *cb_list;
+};
-static struct flow_indr_block_cb *
-flow_indr_block_cb_lookup(struct flow_indr_block_dev *indr_dev,
- flow_indr_block_bind_cb_t *cb, void *cb_ident)
+static void existing_qdiscs_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
{
- struct flow_indr_block_cb *indr_block_cb;
+ struct flow_block_offload bo;
+ struct flow_indir_dev_info *cur;
- list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
- if (indr_block_cb->cb == cb &&
- indr_block_cb->cb_ident == cb_ident)
- return indr_block_cb;
- return NULL;
+ list_for_each_entry(cur, &flow_indir_dev_list, list) {
+ memset(&bo, 0, sizeof(bo));
+ bo.command = cur->command;
+ bo.binder_type = cur->binder_type;
+ INIT_LIST_HEAD(&bo.cb_list);
+ cb(cur->dev, cur->sch, cb_priv, cur->type, &bo, cur->data, cur->cleanup);
+ list_splice(&bo.cb_list, cur->cb_list);
+ }
}
-static struct flow_indr_block_cb *
-flow_indr_block_cb_add(struct flow_indr_block_dev *indr_dev, void *cb_priv,
- flow_indr_block_bind_cb_t *cb, void *cb_ident)
+int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
{
- struct flow_indr_block_cb *indr_block_cb;
+ struct flow_indr_dev *indr_dev;
- indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident);
- if (indr_block_cb)
- return ERR_PTR(-EEXIST);
+ mutex_lock(&flow_indr_block_lock);
+ list_for_each_entry(indr_dev, &flow_block_indr_dev_list, list) {
+ if (indr_dev->cb == cb &&
+ indr_dev->cb_priv == cb_priv) {
+ refcount_inc(&indr_dev->refcnt);
+ mutex_unlock(&flow_indr_block_lock);
+ return 0;
+ }
+ }
- indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL);
- if (!indr_block_cb)
- return ERR_PTR(-ENOMEM);
+ indr_dev = flow_indr_dev_alloc(cb, cb_priv);
+ if (!indr_dev) {
+ mutex_unlock(&flow_indr_block_lock);
+ return -ENOMEM;
+ }
+
+ list_add(&indr_dev->list, &flow_block_indr_dev_list);
+ existing_qdiscs_register(cb, cb_priv);
+ mutex_unlock(&flow_indr_block_lock);
- indr_block_cb->cb_priv = cb_priv;
- indr_block_cb->cb = cb;
- indr_block_cb->cb_ident = cb_ident;
- list_add(&indr_block_cb->list, &indr_dev->cb_list);
+ tcf_action_reoffload_cb(cb, cb_priv, true);
- return indr_block_cb;
+ return 0;
}
+EXPORT_SYMBOL(flow_indr_dev_register);
-static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb)
+static void __flow_block_indr_cleanup(void (*release)(void *cb_priv),
+ void *cb_priv,
+ struct list_head *cleanup_list)
{
- list_del(&indr_block_cb->list);
- kfree(indr_block_cb);
-}
+ struct flow_block_cb *this, *next;
-static DEFINE_MUTEX(flow_indr_block_cb_lock);
+ list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) {
+ if (this->release == release &&
+ this->indr.cb_priv == cb_priv)
+ list_move(&this->indr.list, cleanup_list);
+ }
+}
-static void flow_block_cmd(struct net_device *dev,
- flow_indr_block_bind_cb_t *cb, void *cb_priv,
- enum flow_block_command command)
+static void flow_block_indr_notify(struct list_head *cleanup_list)
{
- struct flow_indr_block_entry *entry;
+ struct flow_block_cb *this, *next;
- mutex_lock(&flow_indr_block_cb_lock);
- list_for_each_entry(entry, &block_cb_list, list) {
- entry->cb(dev, cb, cb_priv, command);
+ list_for_each_entry_safe(this, next, cleanup_list, indr.list) {
+ list_del(&this->indr.list);
+ this->indr.cleanup(this);
}
- mutex_unlock(&flow_indr_block_cb_lock);
}
-int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
- flow_indr_block_bind_cb_t *cb,
- void *cb_ident)
+void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv,
+ void (*release)(void *cb_priv))
{
- struct flow_indr_block_cb *indr_block_cb;
- struct flow_indr_block_dev *indr_dev;
- int err;
-
- indr_dev = flow_indr_block_dev_get(dev);
- if (!indr_dev)
- return -ENOMEM;
+ struct flow_indr_dev *this, *next, *indr_dev = NULL;
+ LIST_HEAD(cleanup_list);
- indr_block_cb = flow_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
- err = PTR_ERR_OR_ZERO(indr_block_cb);
- if (err)
- goto err_dev_put;
+ mutex_lock(&flow_indr_block_lock);
+ list_for_each_entry_safe(this, next, &flow_block_indr_dev_list, list) {
+ if (this->cb == cb &&
+ this->cb_priv == cb_priv &&
+ refcount_dec_and_test(&this->refcnt)) {
+ indr_dev = this;
+ list_del(&indr_dev->list);
+ break;
+ }
+ }
- flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv,
- FLOW_BLOCK_BIND);
+ if (!indr_dev) {
+ mutex_unlock(&flow_indr_block_lock);
+ return;
+ }
- return 0;
+ __flow_block_indr_cleanup(release, cb_priv, &cleanup_list);
+ mutex_unlock(&flow_indr_block_lock);
-err_dev_put:
- flow_indr_block_dev_put(indr_dev);
- return err;
+ tcf_action_reoffload_cb(cb, cb_priv, false);
+ flow_block_indr_notify(&cleanup_list);
+ kfree(indr_dev);
}
-EXPORT_SYMBOL_GPL(__flow_indr_block_cb_register);
+EXPORT_SYMBOL(flow_indr_dev_unregister);
-int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
- flow_indr_block_bind_cb_t *cb,
- void *cb_ident)
+static void flow_block_indr_init(struct flow_block_cb *flow_block,
+ struct flow_block_offload *bo,
+ struct net_device *dev, struct Qdisc *sch, void *data,
+ void *cb_priv,
+ void (*cleanup)(struct flow_block_cb *block_cb))
{
- int err;
-
- rtnl_lock();
- err = __flow_indr_block_cb_register(dev, cb_priv, cb, cb_ident);
- rtnl_unlock();
-
- return err;
+ flow_block->indr.binder_type = bo->binder_type;
+ flow_block->indr.data = data;
+ flow_block->indr.cb_priv = cb_priv;
+ flow_block->indr.dev = dev;
+ flow_block->indr.sch = sch;
+ flow_block->indr.cleanup = cleanup;
}
-EXPORT_SYMBOL_GPL(flow_indr_block_cb_register);
-void __flow_indr_block_cb_unregister(struct net_device *dev,
- flow_indr_block_bind_cb_t *cb,
- void *cb_ident)
+struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb,
+ void *cb_ident, void *cb_priv,
+ void (*release)(void *cb_priv),
+ struct flow_block_offload *bo,
+ struct net_device *dev,
+ struct Qdisc *sch, void *data,
+ void *indr_cb_priv,
+ void (*cleanup)(struct flow_block_cb *block_cb))
{
- struct flow_indr_block_cb *indr_block_cb;
- struct flow_indr_block_dev *indr_dev;
-
- indr_dev = flow_indr_block_dev_lookup(dev);
- if (!indr_dev)
- return;
+ struct flow_block_cb *block_cb;
- indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident);
- if (!indr_block_cb)
- return;
+ block_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, release);
+ if (IS_ERR(block_cb))
+ goto out;
- flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv,
- FLOW_BLOCK_UNBIND);
+ flow_block_indr_init(block_cb, bo, dev, sch, data, indr_cb_priv, cleanup);
+ list_add(&block_cb->indr.list, &flow_block_indr_list);
- flow_indr_block_cb_del(indr_block_cb);
- flow_indr_block_dev_put(indr_dev);
+out:
+ return block_cb;
}
-EXPORT_SYMBOL_GPL(__flow_indr_block_cb_unregister);
+EXPORT_SYMBOL(flow_indr_block_cb_alloc);
-void flow_indr_block_cb_unregister(struct net_device *dev,
- flow_indr_block_bind_cb_t *cb,
- void *cb_ident)
+static struct flow_indir_dev_info *find_indir_dev(void *data)
{
- rtnl_lock();
- __flow_indr_block_cb_unregister(dev, cb, cb_ident);
- rtnl_unlock();
+ struct flow_indir_dev_info *cur;
+
+ list_for_each_entry(cur, &flow_indir_dev_list, list) {
+ if (cur->data == data)
+ return cur;
+ }
+ return NULL;
}
-EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister);
-void flow_indr_block_call(struct net_device *dev,
- struct flow_block_offload *bo,
- enum flow_block_command command)
+static int indir_dev_add(void *data, struct net_device *dev, struct Qdisc *sch,
+ enum tc_setup_type type, void (*cleanup)(struct flow_block_cb *block_cb),
+ struct flow_block_offload *bo)
{
- struct flow_indr_block_cb *indr_block_cb;
- struct flow_indr_block_dev *indr_dev;
+ struct flow_indir_dev_info *info;
- indr_dev = flow_indr_block_dev_lookup(dev);
- if (!indr_dev)
- return;
+ info = find_indir_dev(data);
+ if (info)
+ return -EEXIST;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ info->data = data;
+ info->dev = dev;
+ info->sch = sch;
+ info->type = type;
+ info->cleanup = cleanup;
+ info->command = bo->command;
+ info->binder_type = bo->binder_type;
+ info->cb_list = bo->cb_list_head;
- list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
- indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
- bo);
+ list_add(&info->list, &flow_indir_dev_list);
+ return 0;
}
-EXPORT_SYMBOL_GPL(flow_indr_block_call);
-void flow_indr_add_block_cb(struct flow_indr_block_entry *entry)
+static int indir_dev_remove(void *data)
{
- mutex_lock(&flow_indr_block_cb_lock);
- list_add_tail(&entry->list, &block_cb_list);
- mutex_unlock(&flow_indr_block_cb_lock);
+ struct flow_indir_dev_info *info;
+
+ info = find_indir_dev(data);
+ if (!info)
+ return -ENOENT;
+
+ list_del(&info->list);
+
+ kfree(info);
+ return 0;
}
-EXPORT_SYMBOL_GPL(flow_indr_add_block_cb);
-void flow_indr_del_block_cb(struct flow_indr_block_entry *entry)
+int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch,
+ enum tc_setup_type type, void *data,
+ struct flow_block_offload *bo,
+ void (*cleanup)(struct flow_block_cb *block_cb))
{
- mutex_lock(&flow_indr_block_cb_lock);
- list_del(&entry->list);
- mutex_unlock(&flow_indr_block_cb_lock);
+ struct flow_indr_dev *this;
+ u32 count = 0;
+ int err;
+
+ mutex_lock(&flow_indr_block_lock);
+ if (bo) {
+ if (bo->command == FLOW_BLOCK_BIND)
+ indir_dev_add(data, dev, sch, type, cleanup, bo);
+ else if (bo->command == FLOW_BLOCK_UNBIND)
+ indir_dev_remove(data);
+ }
+
+ list_for_each_entry(this, &flow_block_indr_dev_list, list) {
+ err = this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup);
+ if (!err)
+ count++;
+ }
+
+ mutex_unlock(&flow_indr_block_lock);
+
+ return (bo && list_empty(&bo->cb_list)) ? -EOPNOTSUPP : count;
}
-EXPORT_SYMBOL_GPL(flow_indr_del_block_cb);
+EXPORT_SYMBOL(flow_indr_dev_setup_offload);
-static int __init init_flow_indr_rhashtable(void)
+bool flow_indr_dev_exists(void)
{
- return rhashtable_init(&indr_setup_block_ht,
- &flow_indr_setup_block_ht_params);
+ return !list_empty(&flow_block_indr_dev_list);
}
-subsys_initcall(init_flow_indr_rhashtable);
+EXPORT_SYMBOL(flow_indr_dev_exists);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 80dbf2f4016e..4fcbdd71c59f 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -40,10 +40,10 @@
*/
struct net_rate_estimator {
- struct gnet_stats_basic_packed *bstats;
+ struct gnet_stats_basic_sync *bstats;
spinlock_t *stats_lock;
- seqcount_t *running;
- struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+ bool running;
+ struct gnet_stats_basic_sync __percpu *cpu_bstats;
u8 ewma_log;
u8 intvl_log; /* period : (250ms << intvl_log) */
@@ -60,13 +60,13 @@ struct net_rate_estimator {
};
static void est_fetch_counters(struct net_rate_estimator *e,
- struct gnet_stats_basic_packed *b)
+ struct gnet_stats_basic_sync *b)
{
- memset(b, 0, sizeof(*b));
+ gnet_stats_basic_sync_init(b);
if (e->stats_lock)
spin_lock(e->stats_lock);
- __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
+ gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running);
if (e->stats_lock)
spin_unlock(e->stats_lock);
@@ -76,23 +76,27 @@ static void est_fetch_counters(struct net_rate_estimator *e,
static void est_timer(struct timer_list *t)
{
struct net_rate_estimator *est = from_timer(est, t, timer);
- struct gnet_stats_basic_packed b;
+ struct gnet_stats_basic_sync b;
+ u64 b_bytes, b_packets;
u64 rate, brate;
est_fetch_counters(est, &b);
- brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log);
- brate -= (est->avbps >> est->ewma_log);
+ b_bytes = u64_stats_read(&b.bytes);
+ b_packets = u64_stats_read(&b.packets);
- rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log);
- rate -= (est->avpps >> est->ewma_log);
+ brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log);
+ brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log);
+
+ rate = (b_packets - est->last_packets) << (10 - est->intvl_log);
+ rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log);
write_seqcount_begin(&est->seq);
est->avbps += brate;
est->avpps += rate;
write_seqcount_end(&est->seq);
- est->last_bytes = b.bytes;
- est->last_packets = b.packets;
+ est->last_bytes = b_bytes;
+ est->last_packets = b_packets;
est->next_jiffies += ((HZ/4) << est->intvl_log);
@@ -109,7 +113,9 @@ static void est_timer(struct timer_list *t)
* @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @lock: lock for statistics and control path
- * @running: qdisc running seqcount
+ * @running: true if @bstats represents a running qdisc, thus @bstats'
+ * internal values might change during basic reads. Only used
+ * if @bstats_cpu is NULL
* @opt: rate estimator configuration TLV
*
* Creates a new rate estimator with &bstats as source and &rate_est
@@ -121,16 +127,16 @@ static void est_timer(struct timer_list *t)
* Returns 0 on success or a negative error code.
*
*/
-int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_basic_cpu __percpu *cpu_bstats,
+int gen_new_estimator(struct gnet_stats_basic_sync *bstats,
+ struct gnet_stats_basic_sync __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
spinlock_t *lock,
- seqcount_t *running,
+ bool running,
struct nlattr *opt)
{
struct gnet_estimator *parm = nla_data(opt);
struct net_rate_estimator *old, *est;
- struct gnet_stats_basic_packed b;
+ struct gnet_stats_basic_sync b;
int intvl_log;
if (nla_len(opt) < sizeof(*parm))
@@ -143,6 +149,9 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
+ if (parm->ewma_log == 0 || parm->ewma_log >= 31)
+ return -EINVAL;
+
est = kzalloc(sizeof(*est), GFP_KERNEL);
if (!est)
return -ENOBUFS;
@@ -161,8 +170,8 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
est_fetch_counters(est, &b);
if (lock)
local_bh_enable();
- est->last_bytes = b.bytes;
- est->last_packets = b.packets;
+ est->last_bytes = u64_stats_read(&b.bytes);
+ est->last_packets = u64_stats_read(&b.packets);
if (lock)
spin_lock_bh(lock);
@@ -211,7 +220,9 @@ EXPORT_SYMBOL(gen_kill_estimator);
* @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @lock: lock for statistics and control path
- * @running: qdisc running seqcount (might be NULL)
+ * @running: true if @bstats represents a running qdisc, thus @bstats'
+ * internal values might change during basic reads. Only used
+ * if @cpu_bstats is NULL
* @opt: rate estimator configuration TLV
*
* Replaces the configuration of a rate estimator by calling
@@ -219,11 +230,11 @@ EXPORT_SYMBOL(gen_kill_estimator);
*
* Returns 0 on success or a negative error code.
*/
-int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_basic_cpu __percpu *cpu_bstats,
+int gen_replace_estimator(struct gnet_stats_basic_sync *bstats,
+ struct gnet_stats_basic_sync __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
spinlock_t *lock,
- seqcount_t *running, struct nlattr *opt)
+ bool running, struct nlattr *opt)
{
return gen_new_estimator(bstats, cpu_bstats, rate_est,
lock, running, opt);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 1d653fbfcf52..c8d137ef5980 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -6,7 +6,7 @@
* Jamal Hadi Salim
* Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
- * See Documentation/networking/gen_stats.txt
+ * See Documentation/networking/gen_stats.rst
*/
#include <linux/types.h>
@@ -18,7 +18,7 @@
#include <linux/gen_stats.h>
#include <net/netlink.h>
#include <net/gen_stats.h>
-
+#include <net/sch_generic.h>
static inline int
gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
@@ -114,63 +114,112 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
}
EXPORT_SYMBOL(gnet_stats_start_copy);
-static void
-__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_basic_cpu __percpu *cpu)
+/* Must not be inlined, due to u64_stats seqcount_t lockdep key */
+void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b)
{
+ u64_stats_set(&b->bytes, 0);
+ u64_stats_set(&b->packets, 0);
+ u64_stats_init(&b->syncp);
+}
+EXPORT_SYMBOL(gnet_stats_basic_sync_init);
+
+static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats,
+ struct gnet_stats_basic_sync __percpu *cpu)
+{
+ u64 t_bytes = 0, t_packets = 0;
int i;
for_each_possible_cpu(i) {
- struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i);
+ struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i);
unsigned int start;
u64 bytes, packets;
do {
start = u64_stats_fetch_begin_irq(&bcpu->syncp);
- bytes = bcpu->bstats.bytes;
- packets = bcpu->bstats.packets;
+ bytes = u64_stats_read(&bcpu->bytes);
+ packets = u64_stats_read(&bcpu->packets);
} while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
- bstats->bytes += bytes;
- bstats->packets += packets;
+ t_bytes += bytes;
+ t_packets += packets;
+ }
+ _bstats_update(bstats, t_bytes, t_packets);
+}
+
+void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b, bool running)
+{
+ unsigned int start;
+ u64 bytes = 0;
+ u64 packets = 0;
+
+ WARN_ON_ONCE((cpu || running) && in_hardirq());
+
+ if (cpu) {
+ gnet_stats_add_basic_cpu(bstats, cpu);
+ return;
}
+ do {
+ if (running)
+ start = u64_stats_fetch_begin_irq(&b->syncp);
+ bytes = u64_stats_read(&b->bytes);
+ packets = u64_stats_read(&b->packets);
+ } while (running && u64_stats_fetch_retry_irq(&b->syncp, start));
+
+ _bstats_update(bstats, bytes, packets);
}
+EXPORT_SYMBOL(gnet_stats_add_basic);
-void
-__gnet_stats_copy_basic(const seqcount_t *running,
- struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_basic_cpu __percpu *cpu,
- struct gnet_stats_basic_packed *b)
+static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b, bool running)
{
- unsigned int seq;
+ unsigned int start;
if (cpu) {
- __gnet_stats_copy_basic_cpu(bstats, cpu);
+ u64 t_bytes = 0, t_packets = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i);
+ unsigned int start;
+ u64 bytes, packets;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&bcpu->syncp);
+ bytes = u64_stats_read(&bcpu->bytes);
+ packets = u64_stats_read(&bcpu->packets);
+ } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
+
+ t_bytes += bytes;
+ t_packets += packets;
+ }
+ *ret_bytes = t_bytes;
+ *ret_packets = t_packets;
return;
}
do {
if (running)
- seq = read_seqcount_begin(running);
- bstats->bytes = b->bytes;
- bstats->packets = b->packets;
- } while (running && read_seqcount_retry(running, seq));
+ start = u64_stats_fetch_begin_irq(&b->syncp);
+ *ret_bytes = u64_stats_read(&b->bytes);
+ *ret_packets = u64_stats_read(&b->packets);
+ } while (running && u64_stats_fetch_retry_irq(&b->syncp, start));
}
-EXPORT_SYMBOL(__gnet_stats_copy_basic);
static int
-___gnet_stats_copy_basic(const seqcount_t *running,
- struct gnet_dump *d,
- struct gnet_stats_basic_cpu __percpu *cpu,
- struct gnet_stats_basic_packed *b,
- int type)
+___gnet_stats_copy_basic(struct gnet_dump *d,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b,
+ int type, bool running)
{
- struct gnet_stats_basic_packed bstats = {0};
+ u64 bstats_bytes, bstats_packets;
- __gnet_stats_copy_basic(running, &bstats, cpu, b);
+ gnet_stats_read_basic(&bstats_bytes, &bstats_packets, cpu, b, running);
if (d->compat_tc_stats && type == TCA_STATS_BASIC) {
- d->tc_stats.bytes = bstats.bytes;
- d->tc_stats.packets = bstats.packets;
+ d->tc_stats.bytes = bstats_bytes;
+ d->tc_stats.packets = bstats_packets;
}
if (d->tail) {
@@ -178,24 +227,28 @@ ___gnet_stats_copy_basic(const seqcount_t *running,
int res;
memset(&sb, 0, sizeof(sb));
- sb.bytes = bstats.bytes;
- sb.packets = bstats.packets;
+ sb.bytes = bstats_bytes;
+ sb.packets = bstats_packets;
res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD);
- if (res < 0 || sb.packets == bstats.packets)
+ if (res < 0 || sb.packets == bstats_packets)
return res;
/* emit 64bit stats only if needed */
- return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets,
- sizeof(bstats.packets), TCA_STATS_PAD);
+ return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats_packets,
+ sizeof(bstats_packets), TCA_STATS_PAD);
}
return 0;
}
/**
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
- * @running: seqcount_t pointer
* @d: dumping handle
* @cpu: copy statistic per cpu
* @b: basic statistics
+ * @running: true if @b represents a running qdisc, thus @b's
+ * internal values might change during basic reads.
+ * Only used if @cpu is NULL
+ *
+ * Context: task; must not be run from IRQ or BH contexts
*
* Appends the basic statistics to the top level TLV created by
* gnet_stats_start_copy().
@@ -204,22 +257,25 @@ ___gnet_stats_copy_basic(const seqcount_t *running,
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic(const seqcount_t *running,
- struct gnet_dump *d,
- struct gnet_stats_basic_cpu __percpu *cpu,
- struct gnet_stats_basic_packed *b)
+gnet_stats_copy_basic(struct gnet_dump *d,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b,
+ bool running)
{
- return ___gnet_stats_copy_basic(running, d, cpu, b,
- TCA_STATS_BASIC);
+ return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC, running);
}
EXPORT_SYMBOL(gnet_stats_copy_basic);
/**
* gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV
- * @running: seqcount_t pointer
* @d: dumping handle
* @cpu: copy statistic per cpu
* @b: basic statistics
+ * @running: true if @b represents a running qdisc, thus @b's
+ * internal values might change during basic reads.
+ * Only used if @cpu is NULL
+ *
+ * Context: task; must not be run from IRQ or BH contexts
*
* Appends the basic statistics to the top level TLV created by
* gnet_stats_start_copy().
@@ -228,13 +284,12 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic_hw(const seqcount_t *running,
- struct gnet_dump *d,
- struct gnet_stats_basic_cpu __percpu *cpu,
- struct gnet_stats_basic_packed *b)
+gnet_stats_copy_basic_hw(struct gnet_dump *d,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b,
+ bool running)
{
- return ___gnet_stats_copy_basic(running, d, cpu, b,
- TCA_STATS_BASIC_HW);
+ return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC_HW, running);
}
EXPORT_SYMBOL(gnet_stats_copy_basic_hw);
@@ -282,16 +337,15 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
}
EXPORT_SYMBOL(gnet_stats_copy_rate_est);
-static void
-__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
- const struct gnet_stats_queue __percpu *q)
+static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats,
+ const struct gnet_stats_queue __percpu *q)
{
int i;
for_each_possible_cpu(i) {
const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);
- qstats->qlen = 0;
+ qstats->qlen += qcpu->qlen;
qstats->backlog += qcpu->backlog;
qstats->drops += qcpu->drops;
qstats->requeues += qcpu->requeues;
@@ -299,24 +353,21 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
}
}
-void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
- const struct gnet_stats_queue __percpu *cpu,
- const struct gnet_stats_queue *q,
- __u32 qlen)
+void gnet_stats_add_queue(struct gnet_stats_queue *qstats,
+ const struct gnet_stats_queue __percpu *cpu,
+ const struct gnet_stats_queue *q)
{
if (cpu) {
- __gnet_stats_copy_queue_cpu(qstats, cpu);
+ gnet_stats_add_queue_cpu(qstats, cpu);
} else {
- qstats->qlen = q->qlen;
- qstats->backlog = q->backlog;
- qstats->drops = q->drops;
- qstats->requeues = q->requeues;
- qstats->overlimits = q->overlimits;
+ qstats->qlen += q->qlen;
+ qstats->backlog += q->backlog;
+ qstats->drops += q->drops;
+ qstats->requeues += q->requeues;
+ qstats->overlimits += q->overlimits;
}
-
- qstats->qlen = qlen;
}
-EXPORT_SYMBOL(__gnet_stats_copy_queue);
+EXPORT_SYMBOL(gnet_stats_add_queue);
/**
* gnet_stats_copy_queue - copy queue statistics into statistics TLV
@@ -339,7 +390,8 @@ gnet_stats_copy_queue(struct gnet_dump *d,
{
struct gnet_stats_queue qstats = {0};
- __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen);
+ gnet_stats_add_queue(&qstats, cpu_q, q);
+ qstats.qlen = qlen;
if (d->compat_tc_stats) {
d->tc_stats.drops = qstats.drops;
diff --git a/net/core/gro.c b/net/core/gro.c
new file mode 100644
index 000000000000..bc9451743307
--- /dev/null
+++ b/net/core/gro.c
@@ -0,0 +1,805 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <net/gro.h>
+#include <net/dst_metadata.h>
+#include <net/busy_poll.h>
+#include <trace/events/net.h>
+
+#define MAX_GRO_SKBS 8
+
+/* This should be increased if a protocol with a bigger head is added. */
+#define GRO_MAX_HEAD (MAX_HEADER + 128)
+
+static DEFINE_SPINLOCK(offload_lock);
+static struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base);
+/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
+int gro_normal_batch __read_mostly = 8;
+
+/**
+ * dev_add_offload - register offload handlers
+ * @po: protocol offload declaration
+ *
+ * Add protocol offload handlers to the networking stack. The passed
+ * &proto_offload is linked into kernel lists and may not be freed until
+ * it has been removed from the kernel lists.
+ *
+ * This call does not sleep therefore it can not
+ * guarantee all CPU's that are in middle of receiving packets
+ * will see the new offload handlers (until the next received packet).
+ */
+void dev_add_offload(struct packet_offload *po)
+{
+ struct packet_offload *elem;
+
+ spin_lock(&offload_lock);
+ list_for_each_entry(elem, &offload_base, list) {
+ if (po->priority < elem->priority)
+ break;
+ }
+ list_add_rcu(&po->list, elem->list.prev);
+ spin_unlock(&offload_lock);
+}
+EXPORT_SYMBOL(dev_add_offload);
+
+/**
+ * __dev_remove_offload - remove offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a protocol offload handler that was previously added to the
+ * kernel offload handlers by dev_add_offload(). The passed &offload_type
+ * is removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * The packet type might still be in use by receivers
+ * and must not be freed until after all the CPU's have gone
+ * through a quiescent state.
+ */
+static void __dev_remove_offload(struct packet_offload *po)
+{
+ struct list_head *head = &offload_base;
+ struct packet_offload *po1;
+
+ spin_lock(&offload_lock);
+
+ list_for_each_entry(po1, head, list) {
+ if (po == po1) {
+ list_del_rcu(&po->list);
+ goto out;
+ }
+ }
+
+ pr_warn("dev_remove_offload: %p not found\n", po);
+out:
+ spin_unlock(&offload_lock);
+}
+
+/**
+ * dev_remove_offload - remove packet offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a packet offload handler that was previously added to the kernel
+ * offload handlers by dev_add_offload(). The passed &offload_type is
+ * removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * This call sleeps to guarantee that no CPU is looking at the packet
+ * type after return.
+ */
+void dev_remove_offload(struct packet_offload *po)
+{
+ __dev_remove_offload(po);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(dev_remove_offload);
+
+/**
+ * skb_eth_gso_segment - segmentation handler for ethernet protocols.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @type: Ethernet Protocol ID
+ */
+struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb,
+ netdev_features_t features, __be16 type)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &offload_base, list) {
+ if (ptype->type == type && ptype->callbacks.gso_segment) {
+ segs = ptype->callbacks.gso_segment(skb, features);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return segs;
+}
+EXPORT_SYMBOL(skb_eth_gso_segment);
+
+/**
+ * skb_mac_gso_segment - mac layer segmentation handler.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ */
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+ int vlan_depth = skb->mac_len;
+ __be16 type = skb_network_protocol(skb, &vlan_depth);
+
+ if (unlikely(!type))
+ return ERR_PTR(-EINVAL);
+
+ __skb_pull(skb, vlan_depth);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &offload_base, list) {
+ if (ptype->type == type && ptype->callbacks.gso_segment) {
+ segs = ptype->callbacks.gso_segment(skb, features);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+
+ return segs;
+}
+EXPORT_SYMBOL(skb_mac_gso_segment);
+
+int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
+{
+ struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
+ unsigned int offset = skb_gro_offset(skb);
+ unsigned int headlen = skb_headlen(skb);
+ unsigned int len = skb_gro_len(skb);
+ unsigned int delta_truesize;
+ unsigned int gro_max_size;
+ unsigned int new_truesize;
+ struct sk_buff *lp;
+ int segs;
+
+ /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
+ gro_max_size = READ_ONCE(p->dev->gro_max_size);
+
+ if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
+ return -E2BIG;
+
+ if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
+ if (p->protocol != htons(ETH_P_IPV6) ||
+ skb_headroom(p) < sizeof(struct hop_jumbo_hdr) ||
+ ipv6_hdr(p)->nexthdr != IPPROTO_TCP ||
+ p->encapsulation)
+ return -E2BIG;
+ }
+
+ segs = NAPI_GRO_CB(skb)->count;
+ lp = NAPI_GRO_CB(p)->last;
+ pinfo = skb_shinfo(lp);
+
+ if (headlen <= offset) {
+ skb_frag_t *frag;
+ skb_frag_t *frag2;
+ int i = skbinfo->nr_frags;
+ int nr_frags = pinfo->nr_frags + i;
+
+ if (nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ offset -= headlen;
+ pinfo->nr_frags = nr_frags;
+ skbinfo->nr_frags = 0;
+
+ frag = pinfo->frags + nr_frags;
+ frag2 = skbinfo->frags + i;
+ do {
+ *--frag = *--frag2;
+ } while (--i);
+
+ skb_frag_off_add(frag, offset);
+ skb_frag_size_sub(frag, offset);
+
+ /* all fragments truesize : remove (head size + sk_buff) */
+ new_truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ delta_truesize = skb->truesize - new_truesize;
+
+ skb->truesize = new_truesize;
+ skb->len -= skb->data_len;
+ skb->data_len = 0;
+
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
+ goto done;
+ } else if (skb->head_frag) {
+ int nr_frags = pinfo->nr_frags;
+ skb_frag_t *frag = pinfo->frags + nr_frags;
+ struct page *page = virt_to_head_page(skb->head);
+ unsigned int first_size = headlen - offset;
+ unsigned int first_offset;
+
+ if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ first_offset = skb->data -
+ (unsigned char *)page_address(page) +
+ offset;
+
+ pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
+
+ __skb_frag_set_page(frag, page);
+ skb_frag_off_set(frag, first_offset);
+ skb_frag_size_set(frag, first_size);
+
+ memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
+ /* We dont need to clear skbinfo->nr_frags here */
+
+ new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ delta_truesize = skb->truesize - new_truesize;
+ skb->truesize = new_truesize;
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
+ goto done;
+ }
+
+merge:
+ /* sk owenrship - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
+ delta_truesize = skb->truesize;
+ if (offset > headlen) {
+ unsigned int eat = offset - headlen;
+
+ skb_frag_off_add(&skbinfo->frags[0], eat);
+ skb_frag_size_sub(&skbinfo->frags[0], eat);
+ skb->data_len -= eat;
+ skb->len -= eat;
+ offset = headlen;
+ }
+
+ __skb_pull(skb, offset);
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+ NAPI_GRO_CB(p)->last = skb;
+ __skb_header_release(skb);
+ lp = p;
+
+done:
+ NAPI_GRO_CB(p)->count += segs;
+ p->data_len += len;
+ p->truesize += delta_truesize;
+ p->len += len;
+ if (lp != p) {
+ lp->data_len += len;
+ lp->truesize += delta_truesize;
+ lp->len += len;
+ }
+ NAPI_GRO_CB(skb)->same_flow = 1;
+ return 0;
+}
+
+
+static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
+{
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ struct list_head *head = &offload_base;
+ int err = -ENOENT;
+
+ BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
+
+ if (NAPI_GRO_CB(skb)->count == 1) {
+ skb_shinfo(skb)->gso_size = 0;
+ goto out;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
+ ipv6_gro_complete, inet_gro_complete,
+ skb, 0);
+ break;
+ }
+ rcu_read_unlock();
+
+ if (err) {
+ WARN_ON(&ptype->list == head);
+ kfree_skb(skb);
+ return;
+ }
+
+out:
+ gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
+}
+
+static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
+ bool flush_old)
+{
+ struct list_head *head = &napi->gro_hash[index].list;
+ struct sk_buff *skb, *p;
+
+ list_for_each_entry_safe_reverse(skb, p, head, list) {
+ if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
+ return;
+ skb_list_del_init(skb);
+ napi_gro_complete(napi, skb);
+ napi->gro_hash[index].count--;
+ }
+
+ if (!napi->gro_hash[index].count)
+ __clear_bit(index, &napi->gro_bitmask);
+}
+
+/* napi->gro_hash[].list contains packets ordered by age.
+ * youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+{
+ unsigned long bitmask = napi->gro_bitmask;
+ unsigned int i, base = ~0U;
+
+ while ((i = ffs(bitmask)) != 0) {
+ bitmask >>= i;
+ base += i;
+ __napi_gro_flush_chain(napi, base, flush_old);
+ }
+}
+EXPORT_SYMBOL(napi_gro_flush);
+
+static void gro_list_prepare(const struct list_head *head,
+ const struct sk_buff *skb)
+{
+ unsigned int maclen = skb->dev->hard_header_len;
+ u32 hash = skb_get_hash_raw(skb);
+ struct sk_buff *p;
+
+ list_for_each_entry(p, head, list) {
+ unsigned long diffs;
+
+ NAPI_GRO_CB(p)->flush = 0;
+
+ if (hash != skb_get_hash_raw(p)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+ diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
+ if (skb_vlan_tag_present(p))
+ diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
+ diffs |= skb_metadata_differs(p, skb);
+ if (maclen == ETH_HLEN)
+ diffs |= compare_ether_header(skb_mac_header(p),
+ skb_mac_header(skb));
+ else if (!diffs)
+ diffs = memcmp(skb_mac_header(p),
+ skb_mac_header(skb),
+ maclen);
+
+ /* in most common scenarions 'slow_gro' is 0
+ * otherwise we are already on some slower paths
+ * either skip all the infrequent tests altogether or
+ * avoid trying too hard to skip each of them individually
+ */
+ if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) {
+#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ struct tc_skb_ext *skb_ext;
+ struct tc_skb_ext *p_ext;
+#endif
+
+ diffs |= p->sk != skb->sk;
+ diffs |= skb_metadata_dst_cmp(p, skb);
+ diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
+
+#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ skb_ext = skb_ext_find(skb, TC_SKB_EXT);
+ p_ext = skb_ext_find(p, TC_SKB_EXT);
+
+ diffs |= (!!p_ext) ^ (!!skb_ext);
+ if (!diffs && unlikely(skb_ext))
+ diffs |= p_ext->chain ^ skb_ext->chain;
+#endif
+ }
+
+ NAPI_GRO_CB(p)->same_flow = !diffs;
+ }
+}
+
+static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
+{
+ const struct skb_shared_info *pinfo = skb_shinfo(skb);
+ const skb_frag_t *frag0 = &pinfo->frags[0];
+
+ NAPI_GRO_CB(skb)->data_offset = 0;
+ NAPI_GRO_CB(skb)->frag0 = NULL;
+ NAPI_GRO_CB(skb)->frag0_len = 0;
+
+ if (!skb_headlen(skb) && pinfo->nr_frags &&
+ !PageHighMem(skb_frag_page(frag0)) &&
+ (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
+ NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
+ NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
+ skb_frag_size(frag0),
+ skb->end - skb->tail);
+ }
+}
+
+static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
+{
+ struct skb_shared_info *pinfo = skb_shinfo(skb);
+
+ BUG_ON(skb->end - skb->tail < grow);
+
+ memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
+
+ skb->data_len -= grow;
+ skb->tail += grow;
+
+ skb_frag_off_add(&pinfo->frags[0], grow);
+ skb_frag_size_sub(&pinfo->frags[0], grow);
+
+ if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
+ skb_frag_unref(skb, 0);
+ memmove(pinfo->frags, pinfo->frags + 1,
+ --pinfo->nr_frags * sizeof(pinfo->frags[0]));
+ }
+}
+
+static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
+{
+ struct sk_buff *oldest;
+
+ oldest = list_last_entry(head, struct sk_buff, list);
+
+ /* We are called with head length >= MAX_GRO_SKBS, so this is
+ * impossible.
+ */
+ if (WARN_ON_ONCE(!oldest))
+ return;
+
+ /* Do not adjust napi->gro_hash[].count, caller is adding a new
+ * SKB to the chain.
+ */
+ skb_list_del_init(oldest);
+ napi_gro_complete(napi, oldest);
+}
+
+static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+ u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ struct gro_list *gro_list = &napi->gro_hash[bucket];
+ struct list_head *head = &offload_base;
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ struct sk_buff *pp = NULL;
+ enum gro_result ret;
+ int same_flow;
+ int grow;
+
+ if (netif_elide_gro(skb->dev))
+ goto normal;
+
+ gro_list_prepare(&gro_list->list, skb);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+
+ skb_set_network_header(skb, skb_gro_offset(skb));
+ skb_reset_mac_len(skb);
+ BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32));
+ BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
+ sizeof(u32))); /* Avoid slow unaligned acc */
+ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
+ NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
+ NAPI_GRO_CB(skb)->is_atomic = 1;
+ NAPI_GRO_CB(skb)->count = 1;
+ if (unlikely(skb_is_gso(skb))) {
+ NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
+ /* Only support TCP at the moment. */
+ if (!skb_is_gso_tcp(skb))
+ NAPI_GRO_CB(skb)->flush = 1;
+ }
+
+ /* Setup for GRO checksum validation */
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ NAPI_GRO_CB(skb)->csum = skb->csum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+ break;
+ case CHECKSUM_UNNECESSARY:
+ NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
+ break;
+ }
+
+ pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
+ ipv6_gro_receive, inet_gro_receive,
+ &gro_list->list, skb);
+ break;
+ }
+ rcu_read_unlock();
+
+ if (&ptype->list == head)
+ goto normal;
+
+ if (PTR_ERR(pp) == -EINPROGRESS) {
+ ret = GRO_CONSUMED;
+ goto ok;
+ }
+
+ same_flow = NAPI_GRO_CB(skb)->same_flow;
+ ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
+
+ if (pp) {
+ skb_list_del_init(pp);
+ napi_gro_complete(napi, pp);
+ gro_list->count--;
+ }
+
+ if (same_flow)
+ goto ok;
+
+ if (NAPI_GRO_CB(skb)->flush)
+ goto normal;
+
+ if (unlikely(gro_list->count >= MAX_GRO_SKBS))
+ gro_flush_oldest(napi, &gro_list->list);
+ else
+ gro_list->count++;
+
+ NAPI_GRO_CB(skb)->age = jiffies;
+ NAPI_GRO_CB(skb)->last = skb;
+ if (!skb_is_gso(skb))
+ skb_shinfo(skb)->gso_size = skb_gro_len(skb);
+ list_add(&skb->list, &gro_list->list);
+ ret = GRO_HELD;
+
+pull:
+ grow = skb_gro_offset(skb) - skb_headlen(skb);
+ if (grow > 0)
+ gro_pull_from_frag0(skb, grow);
+ok:
+ if (gro_list->count) {
+ if (!test_bit(bucket, &napi->gro_bitmask))
+ __set_bit(bucket, &napi->gro_bitmask);
+ } else if (test_bit(bucket, &napi->gro_bitmask)) {
+ __clear_bit(bucket, &napi->gro_bitmask);
+ }
+
+ return ret;
+
+normal:
+ ret = GRO_NORMAL;
+ goto pull;
+}
+
+struct packet_offload *gro_find_receive_by_type(__be16 type)
+{
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_receive_by_type);
+
+struct packet_offload *gro_find_complete_by_type(__be16 type)
+{
+ struct list_head *offload_head = &offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_complete_by_type);
+
+static gro_result_t napi_skb_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ gro_normal_one(napi, skb, 1);
+ break;
+
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+ __kfree_skb(skb);
+ else
+ __kfree_skb_defer(skb);
+ break;
+
+ case GRO_HELD:
+ case GRO_MERGED:
+ case GRO_CONSUMED:
+ break;
+ }
+
+ return ret;
+}
+
+gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+{
+ gro_result_t ret;
+
+ skb_mark_napi_id(skb, napi);
+ trace_napi_gro_receive_entry(skb);
+
+ skb_gro_reset_offset(skb, 0);
+
+ ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
+ trace_napi_gro_receive_exit(ret);
+
+ return ret;
+}
+EXPORT_SYMBOL(napi_gro_receive);
+
+static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
+{
+ if (unlikely(skb->pfmemalloc)) {
+ consume_skb(skb);
+ return;
+ }
+ __skb_pull(skb, skb_headlen(skb));
+ /* restore the reserve we had after netdev_alloc_skb_ip_align() */
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
+ __vlan_hwaccel_clear_tag(skb);
+ skb->dev = napi->dev;
+ skb->skb_iif = 0;
+
+ /* eth_type_trans() assumes pkt_type is PACKET_HOST */
+ skb->pkt_type = PACKET_HOST;
+
+ skb->encapsulation = 0;
+ skb_shinfo(skb)->gso_type = 0;
+ skb_shinfo(skb)->gso_size = 0;
+ if (unlikely(skb->slow_gro)) {
+ skb_orphan(skb);
+ skb_ext_reset(skb);
+ nf_reset_ct(skb);
+ skb->slow_gro = 0;
+ }
+
+ napi->skb = skb;
+}
+
+struct sk_buff *napi_get_frags(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+
+ if (!skb) {
+ skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
+ if (skb) {
+ napi->skb = skb;
+ skb_mark_napi_id(skb, napi);
+ }
+ }
+ return skb;
+}
+EXPORT_SYMBOL(napi_get_frags);
+
+static gro_result_t napi_frags_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ case GRO_HELD:
+ __skb_push(skb, ETH_HLEN);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ if (ret == GRO_NORMAL)
+ gro_normal_one(napi, skb, 1);
+ break;
+
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else
+ napi_reuse_skb(napi, skb);
+ break;
+
+ case GRO_MERGED:
+ case GRO_CONSUMED:
+ break;
+ }
+
+ return ret;
+}
+
+/* Upper GRO stack assumes network header starts at gro_offset=0
+ * Drivers could call both napi_gro_frags() and napi_gro_receive()
+ * We copy ethernet header into skb->data to have a common layout.
+ */
+static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+ const struct ethhdr *eth;
+ unsigned int hlen = sizeof(*eth);
+
+ napi->skb = NULL;
+
+ skb_reset_mac_header(skb);
+ skb_gro_reset_offset(skb, hlen);
+
+ if (unlikely(skb_gro_header_hard(skb, hlen))) {
+ eth = skb_gro_header_slow(skb, hlen, 0);
+ if (unlikely(!eth)) {
+ net_warn_ratelimited("%s: dropping impossible skb from %s\n",
+ __func__, napi->dev->name);
+ napi_reuse_skb(napi, skb);
+ return NULL;
+ }
+ } else {
+ eth = (const struct ethhdr *)skb->data;
+ gro_pull_from_frag0(skb, hlen);
+ NAPI_GRO_CB(skb)->frag0 += hlen;
+ NAPI_GRO_CB(skb)->frag0_len -= hlen;
+ }
+ __skb_pull(skb, hlen);
+
+ /*
+ * This works because the only protocols we care about don't require
+ * special handling.
+ * We'll fix it up properly in napi_frags_finish()
+ */
+ skb->protocol = eth->h_proto;
+
+ return skb;
+}
+
+gro_result_t napi_gro_frags(struct napi_struct *napi)
+{
+ gro_result_t ret;
+ struct sk_buff *skb = napi_frags_skb(napi);
+
+ trace_napi_gro_frags_entry(skb);
+
+ ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
+ trace_napi_gro_frags_exit(ret);
+
+ return ret;
+}
+EXPORT_SYMBOL(napi_gro_frags);
+
+/* Compute the checksum from gro_offset and return the folded value
+ * after adding in any pseudo checksum.
+ */
+__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
+{
+ __wsum wsum;
+ __sum16 sum;
+
+ wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
+
+ /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
+ /* See comments in __skb_checksum_complete(). */
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev, skb);
+ }
+
+ NAPI_GRO_CB(skb)->csum = wsum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_gro_checksum_complete);
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
index e095fb871d91..ed5ec5de47f6 100644
--- a/net/core/gro_cells.c
+++ b/net/core/gro_cells.c
@@ -26,9 +26,9 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
cell = this_cpu_ptr(gcells->cells);
- if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) {
+ if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) {
drop:
- atomic_long_inc(&dev->rx_dropped);
+ dev_core_stats_rx_dropped_inc(dev);
kfree_skb(skb);
res = NET_RX_DROP;
goto unlock;
@@ -81,16 +81,30 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
- netif_napi_add(dev, &cell->napi, gro_cell_poll,
- NAPI_POLL_WEIGHT);
+ netif_napi_add(dev, &cell->napi, gro_cell_poll);
napi_enable(&cell->napi);
}
return 0;
}
EXPORT_SYMBOL(gro_cells_init);
+struct percpu_free_defer {
+ struct rcu_head rcu;
+ void __percpu *ptr;
+};
+
+static void percpu_free_defer_callback(struct rcu_head *head)
+{
+ struct percpu_free_defer *defer;
+
+ defer = container_of(head, struct percpu_free_defer, rcu);
+ free_percpu(defer->ptr);
+ kfree(defer);
+}
+
void gro_cells_destroy(struct gro_cells *gcells)
{
+ struct percpu_free_defer *defer;
int i;
if (!gcells->cells)
@@ -99,10 +113,26 @@ void gro_cells_destroy(struct gro_cells *gcells)
struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
napi_disable(&cell->napi);
- netif_napi_del(&cell->napi);
+ __netif_napi_del(&cell->napi);
__skb_queue_purge(&cell->napi_skbs);
}
- free_percpu(gcells->cells);
+ /* We need to observe an rcu grace period before freeing ->cells,
+ * because netpoll could access dev->napi_list under rcu protection.
+ * Try hard using call_rcu() instead of synchronize_rcu(),
+ * because we might be called from cleanup_net(), and we
+ * definitely do not want to block this critical task.
+ */
+ defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN);
+ if (likely(defer)) {
+ defer->ptr = gcells->cells;
+ call_rcu(&defer->rcu, percpu_free_defer_callback);
+ } else {
+ /* We do not hold RTNL at this point, synchronize_net()
+ * would not be able to expedite this sync.
+ */
+ synchronize_rcu_expedited();
+ free_percpu(gcells->cells);
+ }
gcells->cells = NULL;
}
EXPORT_SYMBOL(gro_cells_destroy);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index f153e0601838..aa6cb1f90966 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -18,6 +18,7 @@
#include <linux/bitops.h>
#include <linux/types.h>
+#include "dev.h"
enum lw_bits {
LW_URGENT = 0,
@@ -34,6 +35,9 @@ static DEFINE_SPINLOCK(lweventlist_lock);
static unsigned char default_operstate(const struct net_device *dev)
{
+ if (netif_testing(dev))
+ return IF_OPER_TESTING;
+
if (!netif_carrier_ok(dev))
return (dev->ifindex != dev_get_iflink(dev) ?
IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN);
@@ -52,14 +56,18 @@ static void rfc2863_policy(struct net_device *dev)
if (operstate == dev->operstate)
return;
- write_lock_bh(&dev_base_lock);
+ write_lock(&dev_base_lock);
switch(dev->link_mode) {
+ case IF_LINK_MODE_TESTING:
+ if (operstate == IF_OPER_UP)
+ operstate = IF_OPER_TESTING;
+ break;
+
case IF_LINK_MODE_DORMANT:
if (operstate == IF_OPER_UP)
operstate = IF_OPER_DORMANT;
break;
-
case IF_LINK_MODE_DEFAULT:
default:
break;
@@ -67,14 +75,15 @@ static void rfc2863_policy(struct net_device *dev)
dev->operstate = operstate;
- write_unlock_bh(&dev_base_lock);
+ write_unlock(&dev_base_lock);
}
void linkwatch_init_dev(struct net_device *dev)
{
/* Handle pre-registration link state changes */
- if (!netif_carrier_ok(dev) || netif_dormant(dev))
+ if (!netif_carrier_ok(dev) || netif_dormant(dev) ||
+ netif_testing(dev))
rfc2863_policy(dev);
}
@@ -101,7 +110,7 @@ static void linkwatch_add_event(struct net_device *dev)
spin_lock_irqsave(&lweventlist_lock, flags);
if (list_empty(&dev->link_watch_list)) {
list_add_tail(&dev->link_watch_list, &lweventlist);
- dev_hold(dev);
+ netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC);
}
spin_unlock_irqrestore(&lweventlist_lock, flags);
}
@@ -150,7 +159,7 @@ static void linkwatch_do_dev(struct net_device *dev)
clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
rfc2863_policy(dev);
- if (dev->flags & IFF_UP && netif_device_present(dev)) {
+ if (dev->flags & IFF_UP) {
if (netif_carrier_ok(dev))
dev_activate(dev);
else
@@ -158,7 +167,10 @@ static void linkwatch_do_dev(struct net_device *dev)
netdev_state_change(dev);
}
- dev_put(dev);
+ /* Note: our callers are responsible for calling netdev_tracker_free().
+ * This is the reason we use __dev_put() instead of dev_put().
+ */
+ __dev_put(dev);
}
static void __linkwatch_run_queue(int urgent_only)
@@ -196,10 +208,15 @@ static void __linkwatch_run_queue(int urgent_only)
dev = list_first_entry(&wrk, struct net_device, link_watch_list);
list_del_init(&dev->link_watch_list);
- if (urgent_only && !linkwatch_urgent_event(dev)) {
+ if (!netif_device_present(dev) ||
+ (urgent_only && !linkwatch_urgent_event(dev))) {
list_add_tail(&dev->link_watch_list, &lweventlist);
continue;
}
+ /* We must free netdev tracker under
+ * the spinlock protection.
+ */
+ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
spin_unlock_irq(&lweventlist_lock);
linkwatch_do_dev(dev);
do_dev--;
@@ -223,6 +240,10 @@ void linkwatch_forget_dev(struct net_device *dev)
if (!list_empty(&dev->link_watch_list)) {
list_del_init(&dev->link_watch_list);
clean = 1;
+ /* We must release netdev tracker under
+ * the spinlock protection.
+ */
+ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
}
spin_unlock_irqrestore(&lweventlist_lock, flags);
if (clean)
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 99a6de52b21d..8b6b5e72b217 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
*/
+#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
@@ -39,12 +40,11 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
{
int ret;
- /* Preempt disable is needed to protect per-cpu redirect_info between
- * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
- * access to maps strictly require a rcu_read_lock() for protection,
- * mixing with BH RCU lock doesn't work.
+ /* Migration disable and BH disable are needed to protect per-cpu
+ * redirect_info between BPF prog and skb_do_redirect().
*/
- preempt_disable();
+ migrate_disable();
+ local_bh_disable();
bpf_compute_data_pointers(skb);
ret = bpf_prog_run_save_cb(lwt->prog, skb);
@@ -78,7 +78,8 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
break;
}
- preempt_enable();
+ local_bh_enable();
+ migrate_enable();
return ret;
}
@@ -158,10 +159,8 @@ static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
return dst->lwtstate->orig_output(net, sk, skb);
}
-static int xmit_check_hhlen(struct sk_buff *skb)
+static int xmit_check_hhlen(struct sk_buff *skb, int hh_len)
{
- int hh_len = skb_dst(skb)->dev->hard_header_len;
-
if (skb_headroom(skb) < hh_len) {
int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
@@ -273,6 +272,7 @@ static int bpf_xmit(struct sk_buff *skb)
bpf = bpf_lwt_lwtunnel(dst->lwtstate);
if (bpf->xmit.prog) {
+ int hh_len = dst->dev->hard_header_len;
__be16 proto = skb->protocol;
int ret;
@@ -290,7 +290,7 @@ static int bpf_xmit(struct sk_buff *skb)
/* If the header was expanded, headroom might be too
* small for L2 header to come, expand as needed.
*/
- ret = xmit_check_hhlen(skb);
+ ret = xmit_check_hhlen(skb, hh_len);
if (unlikely(ret))
return ret;
@@ -367,7 +367,7 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
[LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
};
-static int bpf_build_state(struct nlattr *nla,
+static int bpf_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 2f9c0de533c7..6fac2f0ef074 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -23,6 +23,9 @@
#include <net/ip6_fib.h>
#include <net/rtnh.h>
+DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);
+EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled);
+
#ifdef CONFIG_MODULES
static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
@@ -41,8 +44,13 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "BPF";
case LWTUNNEL_ENCAP_SEG6_LOCAL:
return "SEG6LOCAL";
+ case LWTUNNEL_ENCAP_RPL:
+ return "RPL";
+ case LWTUNNEL_ENCAP_IOAM6:
+ return "IOAM6";
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
+ case LWTUNNEL_ENCAP_XFRM:
case LWTUNNEL_ENCAP_NONE:
case __LWTUNNEL_ENCAP_MAX:
/* should not have got here */
@@ -98,7 +106,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
}
EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops);
-int lwtunnel_build_state(u16 encap_type,
+int lwtunnel_build_state(struct net *net, u16 encap_type,
struct nlattr *encap, unsigned int family,
const void *cfg, struct lwtunnel_state **lws,
struct netlink_ext_ack *extack)
@@ -122,7 +130,7 @@ int lwtunnel_build_state(u16 encap_type,
rcu_read_unlock();
if (found) {
- ret = ops->build_state(encap, family, cfg, lws, extack);
+ ret = ops->build_state(net, encap, family, cfg, lws, extack);
if (ret)
module_put(ops->owner);
} else {
@@ -190,6 +198,10 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining,
nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
if (nla_entype) {
+ if (nla_len(nla_entype) < sizeof(u16)) {
+ NL_SET_ERR_MSG(extack, "Invalid RTA_ENCAP_TYPE");
+ return -EINVAL;
+ }
encap_type = nla_get_u16(nla_entype);
if (lwtunnel_valid_encap_type(encap_type,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 789a73aa7bd8..a77a85e357e0 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -41,7 +41,6 @@
#include <trace/events/neigh.h>
-#define DEBUG
#define NEIGH_DEBUG 1
#define neigh_dbg(level, fmt, ...) \
do { \
@@ -112,7 +111,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
unsigned long neigh_rand_reach_time(unsigned long base)
{
- return base ? (prandom_u32() % base) + (base >> 1) : 0;
+ return base ? prandom_u32_max(base) + (base >> 1) : 0;
}
EXPORT_SYMBOL(neigh_rand_reach_time);
@@ -123,6 +122,8 @@ static void neigh_mark_dead(struct neighbour *n)
list_del_init(&n->gc_list);
atomic_dec(&n->tbl->gc_entries);
}
+ if (!list_empty(&n->managed_list))
+ list_del_init(&n->managed_list);
}
static void neigh_update_gc_list(struct neighbour *n)
@@ -131,6 +132,8 @@ static void neigh_update_gc_list(struct neighbour *n)
write_lock_bh(&n->tbl->lock);
write_lock(&n->lock);
+ if (n->dead)
+ goto out;
/* remove from the gc list if new state is permanent or if neighbor
* is externally learned; otherwise entry should be on the gc list
@@ -147,31 +150,59 @@ static void neigh_update_gc_list(struct neighbour *n)
list_add_tail(&n->gc_list, &n->tbl->gc_list);
atomic_inc(&n->tbl->gc_entries);
}
+out:
+ write_unlock(&n->lock);
+ write_unlock_bh(&n->tbl->lock);
+}
+
+static void neigh_update_managed_list(struct neighbour *n)
+{
+ bool on_managed_list, add_to_managed;
+
+ write_lock_bh(&n->tbl->lock);
+ write_lock(&n->lock);
+ if (n->dead)
+ goto out;
+
+ add_to_managed = n->flags & NTF_MANAGED;
+ on_managed_list = !list_empty(&n->managed_list);
+ if (!add_to_managed && on_managed_list)
+ list_del_init(&n->managed_list);
+ else if (add_to_managed && !on_managed_list)
+ list_add_tail(&n->managed_list, &n->tbl->managed_list);
+out:
write_unlock(&n->lock);
write_unlock_bh(&n->tbl->lock);
}
-static bool neigh_update_ext_learned(struct neighbour *neigh, u32 flags,
- int *notify)
+static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify,
+ bool *gc_update, bool *managed_update)
{
- bool rc = false;
- u8 ndm_flags;
+ u32 ndm_flags, old_flags = neigh->flags;
if (!(flags & NEIGH_UPDATE_F_ADMIN))
- return rc;
+ return;
- ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0;
- if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) {
+ ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0;
+ ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0;
+
+ if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) {
if (ndm_flags & NTF_EXT_LEARNED)
neigh->flags |= NTF_EXT_LEARNED;
else
neigh->flags &= ~NTF_EXT_LEARNED;
- rc = true;
*notify = 1;
+ *gc_update = true;
+ }
+ if ((old_flags ^ ndm_flags) & NTF_MANAGED) {
+ if (ndm_flags & NTF_MANAGED)
+ neigh->flags |= NTF_MANAGED;
+ else
+ neigh->flags &= ~NTF_MANAGED;
+ *notify = 1;
+ *managed_update = true;
}
-
- return rc;
}
static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np,
@@ -235,6 +266,9 @@ static int neigh_forced_gc(struct neigh_table *tbl)
write_lock(&n->lock);
if ((n->nud_state == NUD_FAILED) ||
+ (n->nud_state == NUD_NOARP) ||
+ (tbl->is_multicast &&
+ tbl->is_multicast(n->primary_key)) ||
time_after(tref, n->updated))
remove = true;
write_unlock(&n->lock);
@@ -273,11 +307,35 @@ static int neigh_del_timer(struct neighbour *n)
return 0;
}
-static void pneigh_queue_purge(struct sk_buff_head *list)
+static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net)
{
+ struct sk_buff_head tmp;
+ unsigned long flags;
struct sk_buff *skb;
- while ((skb = skb_dequeue(list)) != NULL) {
+ skb_queue_head_init(&tmp);
+ spin_lock_irqsave(&list->lock, flags);
+ skb = skb_peek(list);
+ while (skb != NULL) {
+ struct sk_buff *skb_next = skb_peek_next(skb, list);
+ struct net_device *dev = skb->dev;
+
+ if (net == NULL || net_eq(dev_net(dev), net)) {
+ struct in_device *in_dev;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev)
+ in_dev->arp_parms->qlen--;
+ rcu_read_unlock();
+ __skb_unlink(skb, list);
+ __skb_queue_tail(&tmp, skb);
+ }
+ skb = skb_next;
+ }
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ while ((skb = __skb_dequeue(&tmp))) {
dev_put(skb->dev);
kfree_skb(skb);
}
@@ -351,9 +409,9 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
write_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev, skip_perm);
pneigh_ifdown_and_unlock(tbl, dev);
-
- del_timer_sync(&tbl->proxy_timer);
- pneigh_queue_purge(&tbl->proxy_queue);
+ pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL);
+ if (skb_queue_empty_lockless(&tbl->proxy_queue))
+ del_timer_sync(&tbl->proxy_timer);
return 0;
}
@@ -373,7 +431,7 @@ EXPORT_SYMBOL(neigh_ifdown);
static struct neighbour *neigh_alloc(struct neigh_table *tbl,
struct net_device *dev,
- bool exempt_from_gc)
+ u32 flags, bool exempt_from_gc)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
@@ -406,6 +464,7 @@ do_alloc:
n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole;
+ n->flags = flags;
seqlock_init(&n->hh.hh_lock);
n->parms = neigh_parms_clone(&tbl->parms);
timer_setup(&n->timer, neigh_timer_handler, 0);
@@ -415,6 +474,7 @@ do_alloc:
refcount_set(&n->refcnt, 1);
n->dead = 1;
INIT_LIST_HEAD(&n->gc_list);
+ INIT_LIST_HEAD(&n->managed_list);
atomic_inc(&tbl->entries);
out:
@@ -569,19 +629,18 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
}
EXPORT_SYMBOL(neigh_lookup_nodev);
-static struct neighbour *___neigh_create(struct neigh_table *tbl,
- const void *pkey,
- struct net_device *dev,
- bool exempt_from_gc, bool want_ref)
+static struct neighbour *
+___neigh_create(struct neigh_table *tbl, const void *pkey,
+ struct net_device *dev, u32 flags,
+ bool exempt_from_gc, bool want_ref)
{
- struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc);
- u32 hash_val;
- unsigned int key_len = tbl->key_len;
- int error;
+ u32 hash_val, key_len = tbl->key_len;
+ struct neighbour *n1, *rc, *n;
struct neigh_hash_table *nht;
+ int error;
+ n = neigh_alloc(tbl, dev, flags, exempt_from_gc);
trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc);
-
if (!n) {
rc = ERR_PTR(-ENOBUFS);
goto out;
@@ -589,7 +648,7 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl,
memcpy(n->primary_key, pkey, key_len);
n->dev = dev;
- dev_hold(dev);
+ netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC);
/* Protocol specific setup. */
if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
@@ -644,7 +703,8 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl,
n->dead = 0;
if (!exempt_from_gc)
list_add_tail(&n->gc_list, &n->tbl->gc_list);
-
+ if (n->flags & NTF_MANAGED)
+ list_add_tail(&n->managed_list, &n->tbl->managed_list);
if (want_ref)
neigh_hold(n);
rcu_assign_pointer(n->next,
@@ -668,7 +728,7 @@ out_neigh_release:
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
struct net_device *dev, bool want_ref)
{
- return ___neigh_create(tbl, pkey, dev, false, want_ref);
+ return ___neigh_create(tbl, pkey, dev, 0, false, want_ref);
}
EXPORT_SYMBOL(__neigh_create);
@@ -727,20 +787,17 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
ASSERT_RTNL();
- n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL);
+ n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL);
if (!n)
goto out;
- n->protocol = 0;
write_pnet(&n->net, net);
memcpy(n->key, pkey, key_len);
n->dev = dev;
- if (dev)
- dev_hold(dev);
+ netdev_hold(dev, &n->dev_tracker, GFP_KERNEL);
if (tbl->pconstructor && tbl->pconstructor(n)) {
- if (dev)
- dev_put(dev);
+ netdev_put(dev, &n->dev_tracker);
kfree(n);
n = NULL;
goto out;
@@ -772,8 +829,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
write_unlock_bh(&tbl->lock);
if (tbl->pdestructor)
tbl->pdestructor(n);
- if (n->dev)
- dev_put(n->dev);
+ netdev_put(n->dev, &n->dev_tracker);
kfree(n);
return 0;
}
@@ -806,8 +862,7 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
n->next = NULL;
if (tbl->pdestructor)
tbl->pdestructor(n);
- if (n->dev)
- dev_put(n->dev);
+ netdev_put(n->dev, &n->dev_tracker);
kfree(n);
}
return -ENOENT;
@@ -848,7 +903,7 @@ void neigh_destroy(struct neighbour *neigh)
if (dev->netdev_ops->ndo_neigh_destroy)
dev->netdev_ops->ndo_neigh_destroy(dev, neigh);
- dev_put(dev);
+ netdev_put(dev, &neigh->dev_tracker);
neigh_parms_put(neigh->parms);
neigh_dbg(2, "neigh %p is destroyed\n", neigh);
@@ -1065,11 +1120,12 @@ static void neigh_timer_handler(struct timer_list *t)
neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
notify = 1;
- next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/100);
}
} else {
/* NUD_PROBE|NUD_INCOMPLETE */
- next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100);
}
if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
@@ -1081,8 +1137,8 @@ static void neigh_timer_handler(struct timer_list *t)
}
if (neigh->nud_state & NUD_IN_TIMER) {
- if (time_before(next, jiffies + HZ/2))
- next = jiffies + HZ/2;
+ if (time_before(next, jiffies + HZ/100))
+ next = jiffies + HZ/100;
if (!mod_timer(&neigh->timer, next))
neigh_hold(neigh);
}
@@ -1101,7 +1157,8 @@ out:
neigh_release(neigh);
}
-int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
+int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
+ const bool immediate_ok)
{
int rc;
bool immediate_probe = false;
@@ -1122,18 +1179,23 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
atomic_set(&neigh->probes,
NEIGH_VAR(neigh->parms, UCAST_PROBES));
neigh_del_timer(neigh);
- neigh->nud_state = NUD_INCOMPLETE;
+ neigh->nud_state = NUD_INCOMPLETE;
neigh->updated = now;
- next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
- HZ/2);
+ if (!immediate_ok) {
+ next = now + 1;
+ } else {
+ immediate_probe = true;
+ next = now + max(NEIGH_VAR(neigh->parms,
+ RETRANS_TIME),
+ HZ / 100);
+ }
neigh_add_timer(neigh, next);
- immediate_probe = true;
} else {
neigh->nud_state = NUD_FAILED;
neigh->updated = jiffies;
write_unlock_bh(&neigh->lock);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED);
return 1;
}
} else if (neigh->nud_state & NUD_STALE) {
@@ -1155,7 +1217,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
if (!buff)
break;
neigh->arp_queue_len_bytes -= buff->truesize;
- kfree_skb(buff);
+ kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL);
NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
skb_dst_force(skb);
@@ -1177,7 +1239,7 @@ out_dead:
if (neigh->nud_state & NUD_STALE)
goto out_unlock_bh;
write_unlock_bh(&neigh->lock);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD);
trace_neigh_event_send_dead(neigh, 1);
return 1;
}
@@ -1202,8 +1264,6 @@ static void neigh_update_hhs(struct neighbour *neigh)
}
}
-
-
/* Generic update routine.
-- lladdr is new lladdr or NULL, if it is not supplied.
-- new is new state.
@@ -1214,7 +1274,8 @@ static void neigh_update_hhs(struct neighbour *neigh)
lladdr instead of overriding it
if it is different.
NEIGH_UPDATE_F_ADMIN means that the change is administrative.
-
+ NEIGH_UPDATE_F_USE means that the entry is user triggered.
+ NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed.
NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
NTF_ROUTER flag.
NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as
@@ -1222,17 +1283,15 @@ static void neigh_update_hhs(struct neighbour *neigh)
Caller MUST hold reference count on the entry.
*/
-
static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
u8 new, u32 flags, u32 nlmsg_pid,
struct netlink_ext_ack *extack)
{
- bool ext_learn_change = false;
- u8 old;
- int err;
- int notify = 0;
- struct net_device *dev;
+ bool gc_update = false, managed_update = false;
int update_isrouter = 0;
+ struct net_device *dev;
+ int err, notify = 0;
+ u8 old;
trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid);
@@ -1242,15 +1301,22 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
old = neigh->nud_state;
err = -EPERM;
- if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
- (old & (NUD_NOARP | NUD_PERMANENT)))
- goto out;
if (neigh->dead) {
NL_SET_ERR_MSG(extack, "Neighbor entry is now dead");
+ new = old;
goto out;
}
+ if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
+ (old & (NUD_NOARP | NUD_PERMANENT)))
+ goto out;
- ext_learn_change = neigh_update_ext_learned(neigh, flags, &notify);
+ neigh_update_flags(neigh, flags, &notify, &gc_update, &managed_update);
+ if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) {
+ new = old & ~NUD_PERMANENT;
+ neigh->nud_state = new;
+ err = 0;
+ goto out;
+ }
if (!(new & NUD_VALID)) {
neigh_del_timer(neigh);
@@ -1376,7 +1442,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
* we can reinject the packet there.
*/
n2 = NULL;
- if (dst) {
+ if (dst && dst->obsolete != DST_OBSOLETE_DEAD) {
n2 = dst_neigh_lookup_skb(dst, skb);
if (n2)
n1 = n2;
@@ -1395,15 +1461,13 @@ out:
if (update_isrouter)
neigh_update_is_router(neigh, flags, &notify);
write_unlock_bh(&neigh->lock);
-
- if (((new ^ old) & NUD_PERMANENT) || ext_learn_change)
+ if (((new ^ old) & NUD_PERMANENT) || gc_update)
neigh_update_gc_list(neigh);
-
+ if (managed_update)
+ neigh_update_managed_list(neigh);
if (notify)
neigh_update_notify(neigh, nlmsg_pid);
-
trace_neigh_update_done(neigh, err);
-
return err;
}
@@ -1427,7 +1491,8 @@ void __neigh_set_probe_once(struct neighbour *neigh)
neigh->nud_state = NUD_INCOMPLETE;
atomic_set(&neigh->probes, neigh_max_probes(neigh));
neigh_add_timer(neigh,
- jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME));
+ jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/100));
}
EXPORT_SYMBOL(__neigh_set_probe_once);
@@ -1528,6 +1593,20 @@ int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb)
}
EXPORT_SYMBOL(neigh_direct_output);
+static void neigh_managed_work(struct work_struct *work)
+{
+ struct neigh_table *tbl = container_of(work, struct neigh_table,
+ managed_work.work);
+ struct neighbour *neigh;
+
+ write_lock_bh(&tbl->lock);
+ list_for_each_entry(neigh, &tbl->managed_list, managed_list)
+ neigh_event_send_probe(neigh, NULL, false);
+ queue_delayed_work(system_power_efficient_wq, &tbl->managed_work,
+ NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS));
+ write_unlock_bh(&tbl->lock);
+}
+
static void neigh_proxy_process(struct timer_list *t)
{
struct neigh_table *tbl = from_timer(tbl, t, proxy_timer);
@@ -1542,8 +1621,15 @@ static void neigh_proxy_process(struct timer_list *t)
if (tdif <= 0) {
struct net_device *dev = skb->dev;
+ struct in_device *in_dev;
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev)
+ in_dev->arp_parms->qlen--;
+ rcu_read_unlock();
__skb_unlink(skb, &tbl->proxy_queue);
+
if (tbl->proxy_redo && netif_running(dev)) {
rcu_read_lock();
tbl->proxy_redo(skb);
@@ -1565,12 +1651,10 @@ static void neigh_proxy_process(struct timer_list *t)
void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
struct sk_buff *skb)
{
- unsigned long now = jiffies;
+ unsigned long sched_next = jiffies +
+ prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY));
- unsigned long sched_next = now + (prandom_u32() %
- NEIGH_VAR(p, PROXY_DELAY));
-
- if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) {
+ if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) {
kfree_skb(skb);
return;
}
@@ -1586,6 +1670,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
skb_dst_drop(skb);
dev_hold(skb->dev);
__skb_queue_tail(&tbl->proxy_queue, skb);
+ p->qlen++;
mod_timer(&tbl->proxy_timer, sched_next);
spin_unlock(&tbl->proxy_queue.lock);
}
@@ -1618,13 +1703,14 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
refcount_set(&p->refcnt, 1);
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
- dev_hold(dev);
+ p->qlen = 0;
+ netdev_hold(dev, &p->dev_tracker, GFP_KERNEL);
p->dev = dev;
write_pnet(&p->net, net);
p->sysctl_table = NULL;
if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
- dev_put(dev);
+ netdev_put(dev, &p->dev_tracker);
kfree(p);
return NULL;
}
@@ -1655,8 +1741,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
list_del(&parms->list);
parms->dead = 1;
write_unlock_bh(&tbl->lock);
- if (parms->dev)
- dev_put(parms->dev);
+ netdev_put(parms->dev, &parms->dev_tracker);
call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
}
EXPORT_SYMBOL(neigh_parms_release);
@@ -1677,11 +1762,14 @@ void neigh_table_init(int index, struct neigh_table *tbl)
INIT_LIST_HEAD(&tbl->parms_list);
INIT_LIST_HEAD(&tbl->gc_list);
+ INIT_LIST_HEAD(&tbl->managed_list);
+
list_add(&tbl->parms.list, &tbl->parms_list);
write_pnet(&tbl->parms.net, &init_net);
refcount_set(&tbl->parms.refcnt, 1);
tbl->parms.reachable_time =
neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
+ tbl->parms.qlen = 0;
tbl->stats = alloc_percpu(struct neigh_statistics);
if (!tbl->stats)
@@ -1708,9 +1796,13 @@ void neigh_table_init(int index, struct neigh_table *tbl)
WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN);
rwlock_init(&tbl->lock);
+
INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
tbl->parms.reachable_time);
+ INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work);
+ queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0);
+
timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0);
skb_queue_head_init_class(&tbl->proxy_queue,
&neigh_table_proxy_queue_class);
@@ -1726,9 +1818,10 @@ int neigh_table_clear(int index, struct neigh_table *tbl)
{
neigh_tables[index] = NULL;
/* It is not clean... Fix it to unload IPv6 module safely */
+ cancel_delayed_work_sync(&tbl->managed_work);
cancel_delayed_work_sync(&tbl->gc_work);
del_timer_sync(&tbl->proxy_timer);
- pneigh_queue_purge(&tbl->proxy_queue);
+ pneigh_queue_purge(&tbl->proxy_queue, NULL);
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
pr_crit("neighbour leakage\n");
@@ -1760,15 +1853,13 @@ static struct neigh_table *neigh_find_table(int family)
case AF_INET6:
tbl = neigh_tables[NEIGH_ND_TABLE];
break;
- case AF_DECnet:
- tbl = neigh_tables[NEIGH_DN_TABLE];
- break;
}
return tbl;
}
const struct nla_policy nda_policy[NDA_MAX+1] = {
+ [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID },
[NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) },
@@ -1779,6 +1870,9 @@ const struct nla_policy nda_policy[NDA_MAX+1] = {
[NDA_IFINDEX] = { .type = NLA_U32 },
[NDA_MASTER] = { .type = NLA_U32 },
[NDA_PROTOCOL] = { .type = NLA_U8 },
+ [NDA_NH_ID] = { .type = NLA_U32 },
+ [NDA_FLAGS_EXT] = NLA_POLICY_MASK(NLA_U32, NTF_EXT_MASK),
+ [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED },
};
static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -1850,7 +1944,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE |
- NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
struct nlattr *tb[NDA_MAX+1];
@@ -1859,6 +1953,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
struct neighbour *neigh;
void *dst, *lladdr;
u8 protocol = 0;
+ u32 ndm_flags;
int err;
ASSERT_RTNL();
@@ -1874,6 +1969,15 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
}
ndm = nlmsg_data(nlh);
+ ndm_flags = ndm->ndm_flags;
+ if (tb[NDA_FLAGS_EXT]) {
+ u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]);
+
+ BUILD_BUG_ON(sizeof(neigh->flags) * BITS_PER_BYTE <
+ (sizeof(ndm->ndm_flags) * BITS_PER_BYTE +
+ hweight32(NTF_EXT_MASK)));
+ ndm_flags |= (ext << NTF_EXT_SHIFT);
+ }
if (ndm->ndm_ifindex) {
dev = __dev_get_by_index(net, ndm->ndm_ifindex);
if (dev == NULL) {
@@ -1901,14 +2005,18 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[NDA_PROTOCOL])
protocol = nla_get_u8(tb[NDA_PROTOCOL]);
-
- if (ndm->ndm_flags & NTF_PROXY) {
+ if (ndm_flags & NTF_PROXY) {
struct pneigh_entry *pn;
+ if (ndm_flags & NTF_MANAGED) {
+ NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination");
+ goto out;
+ }
+
err = -ENOBUFS;
pn = pneigh_lookup(tbl, net, dst, dev, 1);
if (pn) {
- pn->flags = ndm->ndm_flags;
+ pn->flags = ndm_flags;
if (protocol)
pn->protocol = protocol;
err = 0;
@@ -1928,16 +2036,24 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
neigh = neigh_lookup(tbl, dst, dev);
if (neigh == NULL) {
- bool exempt_from_gc;
+ bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT;
+ bool exempt_from_gc = ndm_permanent ||
+ ndm_flags & NTF_EXT_LEARNED;
if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
err = -ENOENT;
goto out;
}
+ if (ndm_permanent && (ndm_flags & NTF_MANAGED)) {
+ NL_SET_ERR_MSG(extack, "Invalid NTF_* flag for permanent entry");
+ err = -EINVAL;
+ goto out;
+ }
- exempt_from_gc = ndm->ndm_state & NUD_PERMANENT ||
- ndm->ndm_flags & NTF_EXT_LEARNED;
- neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true);
+ neigh = ___neigh_create(tbl, dst, dev,
+ ndm_flags &
+ (NTF_EXT_LEARNED | NTF_MANAGED),
+ exempt_from_gc, true);
if (IS_ERR(neigh)) {
err = PTR_ERR(neigh);
goto out;
@@ -1954,24 +2070,24 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
}
- if (ndm->ndm_flags & NTF_EXT_LEARNED)
+ if (protocol)
+ neigh->protocol = protocol;
+ if (ndm_flags & NTF_EXT_LEARNED)
flags |= NEIGH_UPDATE_F_EXT_LEARNED;
-
- if (ndm->ndm_flags & NTF_ROUTER)
+ if (ndm_flags & NTF_ROUTER)
flags |= NEIGH_UPDATE_F_ISROUTER;
+ if (ndm_flags & NTF_MANAGED)
+ flags |= NEIGH_UPDATE_F_MANAGED;
+ if (ndm_flags & NTF_USE)
+ flags |= NEIGH_UPDATE_F_USE;
- if (ndm->ndm_flags & NTF_USE) {
+ err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
+ NETLINK_CB(skb).portid, extack);
+ if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) {
neigh_event_send(neigh, NULL);
err = 0;
- } else
- err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
- NETLINK_CB(skb).portid, extack);
-
- if (protocol)
- neigh->protocol = protocol;
-
+ }
neigh_release(neigh);
-
out:
return err;
}
@@ -2015,7 +2131,9 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
nla_put_msecs(skb, NDTPA_PROXY_DELAY,
NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_LOCKTIME,
- NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD))
+ NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS,
+ NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD))
goto nla_put_failure;
return nla_nest_end(skb, nest);
@@ -2170,6 +2288,7 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 },
[NDTPA_PROXY_DELAY] = { .type = NLA_U64 },
[NDTPA_LOCKTIME] = { .type = NLA_U64 },
+ [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 },
};
static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2288,6 +2407,10 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
nla_get_msecs(tbp[i]));
call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
break;
+ case NDTPA_INTERVAL_PROBE_TIME_MS:
+ NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS,
+ nla_get_msecs(tbp[i]));
+ break;
case NDTPA_RETRANS_TIME:
NEIGH_VAR_SET(p, RETRANS_TIME,
nla_get_msecs(tbp[i]));
@@ -2422,6 +2545,7 @@ out:
static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
u32 pid, u32 seq, int type, unsigned int flags)
{
+ u32 neigh_flags, neigh_flags_ext;
unsigned long now = jiffies;
struct nda_cacheinfo ci;
struct nlmsghdr *nlh;
@@ -2431,11 +2555,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
if (nlh == NULL)
return -EMSGSIZE;
+ neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT;
+ neigh_flags = neigh->flags & NTF_OLD_MASK;
+
ndm = nlmsg_data(nlh);
ndm->ndm_family = neigh->ops->family;
ndm->ndm_pad1 = 0;
ndm->ndm_pad2 = 0;
- ndm->ndm_flags = neigh->flags;
+ ndm->ndm_flags = neigh_flags;
ndm->ndm_type = neigh->type;
ndm->ndm_ifindex = neigh->dev->ifindex;
@@ -2466,6 +2593,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol))
goto nla_put_failure;
+ if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext))
+ goto nla_put_failure;
nlmsg_end(skb, nlh);
return 0;
@@ -2479,6 +2608,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
u32 pid, u32 seq, int type, unsigned int flags,
struct neigh_table *tbl)
{
+ u32 neigh_flags, neigh_flags_ext;
struct nlmsghdr *nlh;
struct ndmsg *ndm;
@@ -2486,11 +2616,14 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
if (nlh == NULL)
return -EMSGSIZE;
+ neigh_flags_ext = pn->flags >> NTF_EXT_SHIFT;
+ neigh_flags = pn->flags & NTF_OLD_MASK;
+
ndm = nlmsg_data(nlh);
ndm->ndm_family = tbl->family;
ndm->ndm_pad1 = 0;
ndm->ndm_pad2 = 0;
- ndm->ndm_flags = pn->flags | NTF_PROXY;
+ ndm->ndm_flags = neigh_flags | NTF_PROXY;
ndm->ndm_type = RTN_UNICAST;
ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0;
ndm->ndm_state = NUD_NONE;
@@ -2500,6 +2633,8 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol))
goto nla_put_failure;
+ if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext))
+ goto nla_put_failure;
nlmsg_end(skb, nlh);
return 0;
@@ -2523,6 +2658,13 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx)
return false;
master = dev ? netdev_master_upper_dev_get(dev) : NULL;
+
+ /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another
+ * invalid value for ifindex to denote "no master".
+ */
+ if (master_idx == -1)
+ return !!master;
+
if (!master || master->ifindex != master_idx)
return true;
@@ -2808,6 +2950,7 @@ static inline size_t neigh_nlmsg_size(void)
+ nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
+ nla_total_size(sizeof(struct nda_cacheinfo))
+ nla_total_size(4) /* NDA_PROBES */
+ + nla_total_size(4) /* NDA_FLAGS_EXT */
+ nla_total_size(1); /* NDA_PROTOCOL */
}
@@ -2836,6 +2979,7 @@ static inline size_t pneigh_nlmsg_size(void)
{
return NLMSG_ALIGN(sizeof(struct ndmsg))
+ nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
+ + nla_total_size(4) /* NDA_FLAGS_EXT */
+ nla_total_size(1); /* NDA_PROTOCOL */
}
@@ -3132,7 +3276,7 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
struct pneigh_entry *pn = NULL;
- int bucket = state->bucket;
+ int bucket;
state->flags |= NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
@@ -3264,7 +3408,7 @@ EXPORT_SYMBOL(neigh_seq_stop);
static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));
+ struct neigh_table *tbl = pde_data(file_inode(seq->file));
int cpu;
if (*pos == 0)
@@ -3281,7 +3425,7 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));
+ struct neigh_table *tbl = pde_data(file_inode(seq->file));
int cpu;
for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
@@ -3301,16 +3445,17 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
static int neigh_stat_seq_show(struct seq_file *seq, void *v)
{
- struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));
+ struct neigh_table *tbl = pde_data(file_inode(seq->file));
struct neigh_statistics *st = v;
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n");
+ seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n");
return 0;
}
- seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
- "%08lx %08lx %08lx %08lx %08lx %08lx\n",
+ seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
+ "%08lx %08lx %08lx "
+ "%08lx %08lx %08lx\n",
atomic_read(&tbl->entries),
st->allocs,
@@ -3377,7 +3522,7 @@ EXPORT_SYMBOL(neigh_app_ns);
static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
static int proc_unres_qlen(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int size, ret;
struct ctl_table tmp = *ctl;
@@ -3441,8 +3586,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
}
static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
struct ctl_table tmp = *ctl;
int ret;
@@ -3455,8 +3600,24 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
return ret;
}
-int neigh_proc_dointvec(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int neigh_proc_dointvec_ms_jiffies_positive(struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp = *ctl;
+ int ret;
+
+ int min = msecs_to_jiffies(1);
+
+ tmp.extra1 = &min;
+ tmp.extra2 = NULL;
+
+ ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos);
+ neigh_proc_update(ctl, write);
+ return ret;
+}
+
+int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
@@ -3465,8 +3626,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write,
}
EXPORT_SYMBOL(neigh_proc_dointvec);
-int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
- void __user *buffer,
+int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3477,8 +3637,8 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
EXPORT_SYMBOL(neigh_proc_dointvec_jiffies);
static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3487,8 +3647,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
}
int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3498,8 +3657,8 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies);
static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos);
@@ -3508,8 +3667,8 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
}
static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
struct neigh_parms *p = ctl->extra2;
int ret;
@@ -3553,8 +3712,8 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \
NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies)
-#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \
- NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
+#define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive)
#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \
NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
@@ -3574,6 +3733,8 @@ static struct neigh_sysctl_table {
NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"),
NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"),
NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"),
+ NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS,
+ "interval_probe_time_ms"),
NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"),
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"),
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"),
@@ -3626,7 +3787,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ];
char *p_name;
- t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL);
+ t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT);
if (!t)
goto err;
@@ -3674,10 +3835,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
neigh_proc_base_reachable_time;
}
- /* Don't export sysctls to unprivileged users */
- if (neigh_parms_net(p)->user_ns != &init_user_ns)
- t->neigh_vars[0].procname = NULL;
-
switch (neigh_parms_family(p)) {
case AF_INET:
p_name = "ipv4";
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 6bbd06f7dc7d..1ec23bf8b05c 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -4,15 +4,14 @@
#include <linux/seq_file.h>
#include <net/wext.h>
+#include "dev.h"
+
#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
-extern struct list_head ptype_all __read_mostly;
-extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
-
static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
{
struct net *net = seq_file_net(seq);
@@ -116,6 +115,12 @@ static int dev_seq_show(struct seq_file *seq, void *v)
return 0;
}
+static u32 softnet_backlog_len(struct softnet_data *sd)
+{
+ return skb_queue_len_lockless(&sd->input_pkt_queue) +
+ skb_queue_len_lockless(&sd->process_queue);
+}
+
static struct softnet_data *softnet_get_online(loff_t *pos)
{
struct softnet_data *sd = NULL;
@@ -159,12 +164,17 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
rcu_read_unlock();
#endif
+ /* the index is the CPU id owing this sd. Since offline CPUs are not
+ * displayed, it would be othrwise not trivial for the user-space
+ * mapping the data a specific CPU
+ */
seq_printf(seq,
- "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
sd->processed, sd->dropped, sd->time_squeeze, 0,
0, 0, 0, 0, /* was fastroute */
0, /* was cpu_collision */
- sd->received_rps, flow_limit_count);
+ sd->received_rps, flow_limit_count,
+ softnet_backlog_len(sd), (int)seq->index);
return 0;
}
@@ -182,12 +192,23 @@ static const struct seq_operations softnet_seq_ops = {
.show = softnet_seq_show,
};
-static void *ptype_get_idx(loff_t pos)
+static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
{
+ struct list_head *ptype_list = NULL;
struct packet_type *pt = NULL;
+ struct net_device *dev;
loff_t i = 0;
int t;
+ for_each_netdev_rcu(seq_file_net(seq), dev) {
+ ptype_list = &dev->ptype_all;
+ list_for_each_entry_rcu(pt, ptype_list, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+ }
+
list_for_each_entry_rcu(pt, &ptype_all, list) {
if (i == pos)
return pt;
@@ -208,22 +229,40 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
rcu_read_lock();
- return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
+ return *pos ? ptype_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ struct net_device *dev;
struct packet_type *pt;
struct list_head *nxt;
int hash;
++*pos;
if (v == SEQ_START_TOKEN)
- return ptype_get_idx(0);
+ return ptype_get_idx(seq, 0);
pt = v;
nxt = pt->list.next;
+ if (pt->dev) {
+ if (nxt != &pt->dev->ptype_all)
+ goto found;
+
+ dev = pt->dev;
+ for_each_netdev_continue_rcu(seq_file_net(seq), dev) {
+ if (!list_empty(&dev->ptype_all)) {
+ nxt = dev->ptype_all.next;
+ goto found;
+ }
+ }
+
+ nxt = ptype_all.next;
+ goto ptype_all;
+ }
+
if (pt->type == htons(ETH_P_ALL)) {
+ptype_all:
if (nxt != &ptype_all)
goto found;
hash = 0;
@@ -252,7 +291,8 @@ static int ptype_seq_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN)
seq_puts(seq, "Type Device Function\n");
- else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
+ else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) &&
+ (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) {
if (pt->type == htons(ETH_P_ALL))
seq_puts(seq, "ALL ");
else
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 4c826b8bf9b1..8409d41405df 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -11,6 +11,7 @@
#include <linux/if_arp.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
+#include <linux/sched/isolation.h>
#include <linux/nsproxy.h>
#include <net/sock.h>
#include <net/net_namespace.h>
@@ -23,6 +24,7 @@
#include <linux/of_net.h>
#include <linux/cpu.h>
+#include "dev.h"
#include "net-sysfs.h"
#ifdef CONFIG_SYSFS
@@ -31,6 +33,7 @@ static const char fmt_dec[] = "%d\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";
+/* Caller holds RTNL or dev_base_lock */
static inline int dev_isalive(const struct net_device *dev)
{
return dev->reg_state <= NETREG_REGISTERED;
@@ -56,7 +59,7 @@ static ssize_t netdev_show(const struct device *dev,
#define NETDEVICE_SHOW(field, format_string) \
static ssize_t format_##field(const struct net_device *dev, char *buf) \
{ \
- return sprintf(buf, format_string, dev->field); \
+ return sysfs_emit(buf, format_string, dev->field); \
} \
static ssize_t field##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
@@ -80,7 +83,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
struct net_device *netdev = to_net_dev(dev);
struct net *net = dev_net(netdev);
unsigned long new;
- int ret = -EINVAL;
+ int ret;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -115,13 +118,13 @@ static ssize_t iflink_show(struct device *dev, struct device_attribute *attr,
{
struct net_device *ndev = to_net_dev(dev);
- return sprintf(buf, fmt_dec, dev_get_iflink(ndev));
+ return sysfs_emit(buf, fmt_dec, dev_get_iflink(ndev));
}
static DEVICE_ATTR_RO(iflink);
static ssize_t format_name_assign_type(const struct net_device *dev, char *buf)
{
- return sprintf(buf, fmt_dec, dev->name_assign_type);
+ return sysfs_emit(buf, fmt_dec, dev->name_assign_type);
}
static ssize_t name_assign_type_show(struct device *dev,
@@ -174,6 +177,14 @@ static int change_carrier(struct net_device *dev, unsigned long new_carrier)
static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
+ struct net_device *netdev = to_net_dev(dev);
+
+ /* The check is also done in change_carrier; this helps returning early
+ * without hitting the trylock/restart in netdev_store.
+ */
+ if (!netdev->netdev_ops->ndo_change_carrier)
+ return -EOPNOTSUPP;
+
return netdev_store(dev, attr, buf, len, change_carrier);
}
@@ -183,7 +194,7 @@ static ssize_t carrier_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
if (netif_running(netdev))
- return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev));
+ return sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev));
return -EINVAL;
}
@@ -195,14 +206,20 @@ static ssize_t speed_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
int ret = -EINVAL;
+ /* The check is also done in __ethtool_get_link_ksettings; this helps
+ * returning early without hitting the trylock/restart below.
+ */
+ if (!netdev->ethtool_ops->get_link_ksettings)
+ return ret;
+
if (!rtnl_trylock())
return restart_syscall();
- if (netif_running(netdev)) {
+ if (netif_running(netdev) && netif_device_present(netdev)) {
struct ethtool_link_ksettings cmd;
if (!__ethtool_get_link_ksettings(netdev, &cmd))
- ret = sprintf(buf, fmt_dec, cmd.base.speed);
+ ret = sysfs_emit(buf, fmt_dec, cmd.base.speed);
}
rtnl_unlock();
return ret;
@@ -215,6 +232,12 @@ static ssize_t duplex_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
int ret = -EINVAL;
+ /* The check is also done in __ethtool_get_link_ksettings; this helps
+ * returning early without hitting the trylock/restart below.
+ */
+ if (!netdev->ethtool_ops->get_link_ksettings)
+ return ret;
+
if (!rtnl_trylock())
return restart_syscall();
@@ -235,7 +258,7 @@ static ssize_t duplex_show(struct device *dev,
duplex = "unknown";
break;
}
- ret = sprintf(buf, "%s\n", duplex);
+ ret = sysfs_emit(buf, "%s\n", duplex);
}
}
rtnl_unlock();
@@ -243,13 +266,25 @@ static ssize_t duplex_show(struct device *dev,
}
static DEVICE_ATTR_RO(duplex);
+static ssize_t testing_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ if (netif_running(netdev))
+ return sysfs_emit(buf, fmt_dec, !!netif_testing(netdev));
+
+ return -EINVAL;
+}
+static DEVICE_ATTR_RO(testing);
+
static ssize_t dormant_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
if (netif_running(netdev))
- return sprintf(buf, fmt_dec, !!netif_dormant(netdev));
+ return sysfs_emit(buf, fmt_dec, !!netif_dormant(netdev));
return -EINVAL;
}
@@ -260,7 +295,7 @@ static const char *const operstates[] = {
"notpresent", /* currently unused */
"down",
"lowerlayerdown",
- "testing", /* currently unused */
+ "testing",
"dormant",
"up"
};
@@ -280,7 +315,7 @@ static ssize_t operstate_show(struct device *dev,
if (operstate >= ARRAY_SIZE(operstates))
return -EINVAL; /* should not happen */
- return sprintf(buf, "%s\n", operstates[operstate]);
+ return sysfs_emit(buf, "%s\n", operstates[operstate]);
}
static DEVICE_ATTR_RO(operstate);
@@ -290,9 +325,9 @@ static ssize_t carrier_changes_show(struct device *dev,
{
struct net_device *netdev = to_net_dev(dev);
- return sprintf(buf, fmt_dec,
- atomic_read(&netdev->carrier_up_count) +
- atomic_read(&netdev->carrier_down_count));
+ return sysfs_emit(buf, fmt_dec,
+ atomic_read(&netdev->carrier_up_count) +
+ atomic_read(&netdev->carrier_down_count));
}
static DEVICE_ATTR_RO(carrier_changes);
@@ -302,7 +337,7 @@ static ssize_t carrier_up_count_show(struct device *dev,
{
struct net_device *netdev = to_net_dev(dev);
- return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count));
+ return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count));
}
static DEVICE_ATTR_RO(carrier_up_count);
@@ -312,7 +347,7 @@ static ssize_t carrier_down_count_show(struct device *dev,
{
struct net_device *netdev = to_net_dev(dev);
- return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count));
+ return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_down_count));
}
static DEVICE_ATTR_RO(carrier_down_count);
@@ -355,7 +390,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
{
- dev->gro_flush_timeout = val;
+ WRITE_ONCE(dev->gro_flush_timeout, val);
return 0;
}
@@ -370,6 +405,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev,
}
NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
+static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val)
+{
+ WRITE_ONCE(dev->napi_defer_hard_irqs, val);
+ return 0;
+}
+
+static ssize_t napi_defer_hard_irqs_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs);
+}
+NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec);
+
static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -410,7 +462,7 @@ static ssize_t ifalias_show(struct device *dev,
ret = dev_get_alias(netdev, tmp, sizeof(tmp));
if (ret > 0)
- ret = sprintf(buf, "%s\n", tmp);
+ ret = sysfs_emit(buf, "%s\n", tmp);
return ret;
}
static DEVICE_ATTR_RW(ifalias);
@@ -448,6 +500,12 @@ static ssize_t phys_port_id_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
ssize_t ret = -EINVAL;
+ /* The check is also done in dev_get_phys_port_id; this helps returning
+ * early without hitting the trylock/restart below.
+ */
+ if (!netdev->netdev_ops->ndo_get_phys_port_id)
+ return -EOPNOTSUPP;
+
if (!rtnl_trylock())
return restart_syscall();
@@ -456,7 +514,7 @@ static ssize_t phys_port_id_show(struct device *dev,
ret = dev_get_phys_port_id(netdev, &ppid);
if (!ret)
- ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+ ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
}
rtnl_unlock();
@@ -470,6 +528,13 @@ static ssize_t phys_port_name_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
ssize_t ret = -EINVAL;
+ /* The checks are also done in dev_get_phys_port_name; this helps
+ * returning early without hitting the trylock/restart below.
+ */
+ if (!netdev->netdev_ops->ndo_get_phys_port_name &&
+ !netdev->netdev_ops->ndo_get_devlink_port)
+ return -EOPNOTSUPP;
+
if (!rtnl_trylock())
return restart_syscall();
@@ -478,7 +543,7 @@ static ssize_t phys_port_name_show(struct device *dev,
ret = dev_get_phys_port_name(netdev, name, sizeof(name));
if (!ret)
- ret = sprintf(buf, "%s\n", name);
+ ret = sysfs_emit(buf, "%s\n", name);
}
rtnl_unlock();
@@ -492,6 +557,14 @@ static ssize_t phys_switch_id_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
ssize_t ret = -EINVAL;
+ /* The checks are also done in dev_get_phys_port_name; this helps
+ * returning early without hitting the trylock/restart below. This works
+ * because recurse is false when calling dev_get_port_parent_id.
+ */
+ if (!netdev->netdev_ops->ndo_get_port_parent_id &&
+ !netdev->netdev_ops->ndo_get_devlink_port)
+ return -EOPNOTSUPP;
+
if (!rtnl_trylock())
return restart_syscall();
@@ -500,7 +573,7 @@ static ssize_t phys_switch_id_show(struct device *dev,
ret = dev_get_port_parent_id(netdev, &ppid, false);
if (!ret)
- ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+ ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
}
rtnl_unlock();
@@ -508,6 +581,45 @@ static ssize_t phys_switch_id_show(struct device *dev,
}
static DEVICE_ATTR_RO(phys_switch_id);
+static ssize_t threaded_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev))
+ ret = sysfs_emit(buf, fmt_dec, netdev->threaded);
+
+ rtnl_unlock();
+ return ret;
+}
+
+static int modify_napi_threaded(struct net_device *dev, unsigned long val)
+{
+ int ret;
+
+ if (list_empty(&dev->napi_list))
+ return -EOPNOTSUPP;
+
+ if (val != 0 && val != 1)
+ return -EOPNOTSUPP;
+
+ ret = dev_set_threaded(dev, val);
+
+ return ret;
+}
+
+static ssize_t threaded_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, modify_napi_threaded);
+}
+static DEVICE_ATTR_RW(threaded);
+
static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_netdev_group.attr,
&dev_attr_type.attr,
@@ -524,6 +636,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_speed.attr,
&dev_attr_duplex.attr,
&dev_attr_dormant.attr,
+ &dev_attr_testing.attr,
&dev_attr_operstate.attr,
&dev_attr_carrier_changes.attr,
&dev_attr_ifalias.attr,
@@ -532,12 +645,14 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_flags.attr,
&dev_attr_tx_queue_len.attr,
&dev_attr_gro_flush_timeout.attr,
+ &dev_attr_napi_defer_hard_irqs.attr,
&dev_attr_phys_port_id.attr,
&dev_attr_phys_port_name.attr,
&dev_attr_phys_switch_id.attr,
&dev_attr_proto_down.attr,
&dev_attr_carrier_up_count.attr,
&dev_attr_carrier_down_count.attr,
+ &dev_attr_threaded.attr,
NULL,
};
ATTRIBUTE_GROUPS(net_class);
@@ -558,7 +673,7 @@ static ssize_t netstat_show(const struct device *d,
struct rtnl_link_stats64 temp;
const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
- ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset));
+ ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset));
}
read_unlock(&dev_base_lock);
return ret;
@@ -632,7 +747,6 @@ static const struct attribute_group netstat_group = {
.attrs = netstat_attrs,
};
-#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
static struct attribute *wireless_attrs[] = {
NULL
};
@@ -641,7 +755,19 @@ static const struct attribute_group wireless_group = {
.name = "wireless",
.attrs = wireless_attrs,
};
+
+static bool wireless_group_needed(struct net_device *ndev)
+{
+#if IS_ENABLED(CONFIG_CFG80211)
+ if (ndev->ieee80211_ptr)
+ return true;
+#endif
+#if IS_ENABLED(CONFIG_WIRELESS_EXT)
+ if (ndev->wireless_handlers)
+ return true;
#endif
+ return false;
+}
#else /* CONFIG_SYSFS */
#define net_class_groups NULL
@@ -698,7 +824,7 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf)
for (i = 0; i < map->len; i++)
cpumask_set_cpu(map->cpus[i], mask);
- len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
+ len = sysfs_emit(buf, "%*pb\n", cpumask_pr_args(mask));
rcu_read_unlock();
free_cpumask_var(mask);
@@ -725,6 +851,15 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
return err;
}
+ if (!cpumask_empty(mask)) {
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
+ if (cpumask_empty(mask)) {
+ free_cpumask_var(mask);
+ return -EINVAL;
+ }
+ }
+
map = kzalloc(max_t(unsigned int,
RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
GFP_KERNEL);
@@ -775,7 +910,7 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
val = (unsigned long)flow_table->mask + 1;
rcu_read_unlock();
- return sprintf(buf, "%lu\n", val);
+ return sysfs_emit(buf, "%lu\n", val);
}
static void rps_dev_flow_table_release(struct rcu_head *rcu)
@@ -882,7 +1017,7 @@ static void rx_queue_release(struct kobject *kobj)
#endif
memset(kobj, 0, sizeof(*kobj));
- dev_put(queue->dev);
+ netdev_put(queue->dev, &queue->dev_tracker);
}
static const void *rx_queue_namespace(struct kobject *kobj)
@@ -922,7 +1057,7 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
/* Kobject_put later will trigger rx_queue_release call which
* decreases dev refcount: Take that reference here
*/
- dev_hold(queue->dev);
+ netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL);
kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
@@ -944,6 +1079,24 @@ err:
kobject_put(kobj);
return error;
}
+
+static int rx_queue_change_owner(struct net_device *dev, int index, kuid_t kuid,
+ kgid_t kgid)
+{
+ struct netdev_rx_queue *queue = dev->_rx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error;
+
+ error = sysfs_change_owner(kobj, kuid, kgid);
+ if (error)
+ return error;
+
+ if (dev->sysfs_rx_queue_group)
+ error = sysfs_group_change_owner(
+ kobj, dev->sysfs_rx_queue_group, kuid, kgid);
+
+ return error;
+}
#endif /* CONFIG_SYSFS */
int
@@ -968,7 +1121,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
while (--i >= new_num) {
struct kobject *kobj = &dev->_rx[i].kobj;
- if (!refcount_read(&dev_net(dev)->count))
+ if (!refcount_read(&dev_net(dev)->ns.count))
kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
@@ -981,6 +1134,29 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
#endif
}
+static int net_rx_queue_change_owner(struct net_device *dev, int num,
+ kuid_t kuid, kgid_t kgid)
+{
+#ifdef CONFIG_SYSFS
+ int error = 0;
+ int i;
+
+#ifndef CONFIG_RPS
+ if (!dev->sysfs_rx_queue_group)
+ return 0;
+#endif
+ for (i = 0; i < num; i++) {
+ error = rx_queue_change_owner(dev, i, kuid, kgid);
+ if (error)
+ break;
+ }
+
+ return error;
+#else
+ return 0;
+#endif
+}
+
#ifdef CONFIG_SYSFS
/*
* netdev_queue sysfs structures and functions.
@@ -1030,13 +1206,9 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {
static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf)
{
- unsigned long trans_timeout;
+ unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout);
- spin_lock_irq(&queue->_xmit_lock);
- trans_timeout = queue->trans_timeout;
- spin_unlock_irq(&queue->_xmit_lock);
-
- return sprintf(buf, "%lu", trans_timeout);
+ return sysfs_emit(buf, fmt_ulong, trans_timeout);
}
static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
@@ -1054,18 +1226,25 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
char *buf)
{
struct net_device *dev = queue->dev;
+ int num_tc, tc;
int index;
- int tc;
if (!netif_is_multiqueue(dev))
return -ENOENT;
+ if (!rtnl_trylock())
+ return restart_syscall();
+
index = get_netdev_queue_index(queue);
/* If queue belongs to subordinate dev use its TC mapping */
dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+ num_tc = dev->num_tc;
tc = netdev_txq_to_tc(dev, index);
+
+ rtnl_unlock();
+
if (tc < 0)
return -EINVAL;
@@ -1076,15 +1255,15 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
* belongs to the root device it will be reported with just the
* traffic class, so just "0" for TC 0 for example.
*/
- return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) :
- sprintf(buf, "%u\n", tc);
+ return num_tc < 0 ? sysfs_emit(buf, "%d%d\n", tc, num_tc) :
+ sysfs_emit(buf, "%d\n", tc);
}
#ifdef CONFIG_XPS
static ssize_t tx_maxrate_show(struct netdev_queue *queue,
char *buf)
{
- return sprintf(buf, "%lu\n", queue->tx_maxrate);
+ return sysfs_emit(buf, "%lu\n", queue->tx_maxrate);
}
static ssize_t tx_maxrate_store(struct netdev_queue *queue,
@@ -1097,6 +1276,12 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
+ /* The check is also done later; this helps returning early without
+ * hitting the trylock/restart below.
+ */
+ if (!dev->netdev_ops->ndo_set_tx_maxrate)
+ return -EOPNOTSUPP;
+
err = kstrtou32(buf, 10, &rate);
if (err < 0)
return err;
@@ -1132,7 +1317,7 @@ static struct netdev_queue_attribute queue_traffic_class __ro_after_init
*/
static ssize_t bql_show(char *buf, unsigned int value)
{
- return sprintf(buf, "%u\n", value);
+ return sysfs_emit(buf, "%u\n", value);
}
static ssize_t bql_set(const char *buf, const size_t count,
@@ -1161,7 +1346,7 @@ static ssize_t bql_show_hold_time(struct netdev_queue *queue,
{
struct dql *dql = &queue->dql;
- return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
+ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
}
static ssize_t bql_set_hold_time(struct netdev_queue *queue,
@@ -1189,7 +1374,7 @@ static ssize_t bql_show_inflight(struct netdev_queue *queue,
{
struct dql *dql = &queue->dql;
- return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed);
+ return sysfs_emit(buf, "%u\n", dql->num_queued - dql->num_completed);
}
static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init =
@@ -1232,68 +1417,94 @@ static const struct attribute_group dql_group = {
#endif /* CONFIG_BQL */
#ifdef CONFIG_XPS
-static ssize_t xps_cpus_show(struct netdev_queue *queue,
- char *buf)
+static ssize_t xps_queue_show(struct net_device *dev, unsigned int index,
+ int tc, char *buf, enum xps_map_type type)
{
- struct net_device *dev = queue->dev;
- int cpu, len, num_tc = 1, tc = 0;
struct xps_dev_maps *dev_maps;
- cpumask_var_t mask;
- unsigned long index;
+ unsigned long *mask;
+ unsigned int nr_ids;
+ int j, len;
- if (!netif_is_multiqueue(dev))
- return -ENOENT;
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps[type]);
- index = get_netdev_queue_index(queue);
+ /* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0
+ * when dev_maps hasn't been allocated yet, to be backward compatible.
+ */
+ nr_ids = dev_maps ? dev_maps->nr_ids :
+ (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues);
- if (dev->num_tc) {
- /* Do not allow XPS on subordinate device directly */
- num_tc = dev->num_tc;
- if (num_tc < 0)
- return -EINVAL;
+ mask = bitmap_zalloc(nr_ids, GFP_NOWAIT);
+ if (!mask) {
+ rcu_read_unlock();
+ return -ENOMEM;
+ }
- /* If queue belongs to subordinate dev use its map */
- dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+ if (!dev_maps || tc >= dev_maps->num_tc)
+ goto out_no_maps;
- tc = netdev_txq_to_tc(dev, index);
- if (tc < 0)
- return -EINVAL;
- }
+ for (j = 0; j < nr_ids; j++) {
+ int i, tci = j * dev_maps->num_tc + tc;
+ struct xps_map *map;
- if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
+ map = rcu_dereference(dev_maps->attr_map[tci]);
+ if (!map)
+ continue;
- rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_cpus_map);
- if (dev_maps) {
- for_each_possible_cpu(cpu) {
- int i, tci = cpu * num_tc + tc;
- struct xps_map *map;
-
- map = rcu_dereference(dev_maps->attr_map[tci]);
- if (!map)
- continue;
-
- for (i = map->len; i--;) {
- if (map->queues[i] == index) {
- cpumask_set_cpu(cpu, mask);
- break;
- }
+ for (i = map->len; i--;) {
+ if (map->queues[i] == index) {
+ __set_bit(j, mask);
+ break;
}
}
}
+out_no_maps:
rcu_read_unlock();
- len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
- free_cpumask_var(mask);
+ len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids);
+ bitmap_free(mask);
+
return len < PAGE_SIZE ? len : -EINVAL;
}
+static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf)
+{
+ struct net_device *dev = queue->dev;
+ unsigned int index;
+ int len, tc;
+
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+
+ index = get_netdev_queue_index(queue);
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ /* If queue belongs to subordinate dev use its map */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+ /* Make sure the subordinate device can't be freed */
+ get_device(&dev->dev);
+ rtnl_unlock();
+
+ len = xps_queue_show(dev, index, tc, buf, XPS_CPUS);
+
+ put_device(&dev->dev);
+ return len;
+}
+
static ssize_t xps_cpus_store(struct netdev_queue *queue,
const char *buf, size_t len)
{
struct net_device *dev = queue->dev;
- unsigned long index;
+ unsigned int index;
cpumask_var_t mask;
int err;
@@ -1314,7 +1525,13 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
return err;
}
+ if (!rtnl_trylock()) {
+ free_cpumask_var(mask);
+ return restart_syscall();
+ }
+
err = netif_set_xps_queue(dev, mask, index);
+ rtnl_unlock();
free_cpumask_var(mask);
@@ -1327,50 +1544,20 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
{
struct net_device *dev = queue->dev;
- struct xps_dev_maps *dev_maps;
- unsigned long *mask, index;
- int j, len, num_tc = 1, tc = 0;
+ unsigned int index;
+ int tc;
index = get_netdev_queue_index(queue);
- if (dev->num_tc) {
- num_tc = dev->num_tc;
- tc = netdev_txq_to_tc(dev, index);
- if (tc < 0)
- return -EINVAL;
- }
- mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);
- if (!mask)
- return -ENOMEM;
-
- rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_rxqs_map);
- if (!dev_maps)
- goto out_no_maps;
-
- for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
- j < dev->num_rx_queues;) {
- int i, tci = j * num_tc + tc;
- struct xps_map *map;
-
- map = rcu_dereference(dev_maps->attr_map[tci]);
- if (!map)
- continue;
-
- for (i = map->len; i--;) {
- if (map->queues[i] == index) {
- set_bit(j, mask);
- break;
- }
- }
- }
-out_no_maps:
- rcu_read_unlock();
+ if (!rtnl_trylock())
+ return restart_syscall();
- len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
- bitmap_free(mask);
+ tc = netdev_txq_to_tc(dev, index);
+ rtnl_unlock();
+ if (tc < 0)
+ return -EINVAL;
- return len < PAGE_SIZE ? len : -EINVAL;
+ return xps_queue_show(dev, index, tc, buf, XPS_RXQS);
}
static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
@@ -1378,7 +1565,8 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
{
struct net_device *dev = queue->dev;
struct net *net = dev_net(dev);
- unsigned long *mask, index;
+ unsigned long *mask;
+ unsigned int index;
int err;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -1396,10 +1584,17 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
return err;
}
+ if (!rtnl_trylock()) {
+ bitmap_free(mask);
+ return restart_syscall();
+ }
+
cpus_read_lock();
- err = __netif_set_xps_queue(dev, mask, index, true);
+ err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS);
cpus_read_unlock();
+ rtnl_unlock();
+
bitmap_free(mask);
return err ? : len;
}
@@ -1425,7 +1620,7 @@ static void netdev_queue_release(struct kobject *kobj)
struct netdev_queue *queue = to_netdev_queue(kobj);
memset(kobj, 0, sizeof(*kobj));
- dev_put(queue->dev);
+ netdev_put(queue->dev, &queue->dev_tracker);
}
static const void *netdev_queue_namespace(struct kobject *kobj)
@@ -1465,7 +1660,7 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
/* Kobject_put later will trigger netdev_queue_release call
* which decreases dev refcount: Take that reference here
*/
- dev_hold(queue->dev);
+ netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL);
kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
@@ -1486,6 +1681,23 @@ err:
kobject_put(kobj);
return error;
}
+
+static int tx_queue_change_owner(struct net_device *ndev, int index,
+ kuid_t kuid, kgid_t kgid)
+{
+ struct netdev_queue *queue = ndev->_tx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error;
+
+ error = sysfs_change_owner(kobj, kuid, kgid);
+ if (error)
+ return error;
+
+#ifdef CONFIG_BQL
+ error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid);
+#endif
+ return error;
+}
#endif /* CONFIG_SYSFS */
int
@@ -1495,6 +1707,13 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
int i;
int error = 0;
+ /* Tx queue kobjects are allowed to be updated when a device is being
+ * unregistered, but solely to remove queues from qdiscs. Any path
+ * adding queues should be fixed.
+ */
+ WARN(dev->reg_state == NETREG_UNREGISTERING && new_num > old_num,
+ "New queues can't be registered after device unregistration.");
+
for (i = old_num; i < new_num; i++) {
error = netdev_queue_add_kobject(dev, i);
if (error) {
@@ -1506,7 +1725,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
while (--i >= new_num) {
struct netdev_queue *queue = dev->_tx + i;
- if (!refcount_read(&dev_net(dev)->count))
+ if (!refcount_read(&dev_net(dev)->ns.count))
queue->kobj.uevent_suppress = 1;
#ifdef CONFIG_BQL
sysfs_remove_group(&queue->kobj, &dql_group);
@@ -1520,6 +1739,25 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
#endif /* CONFIG_SYSFS */
}
+static int net_tx_queue_change_owner(struct net_device *dev, int num,
+ kuid_t kuid, kgid_t kgid)
+{
+#ifdef CONFIG_SYSFS
+ int error = 0;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ error = tx_queue_change_owner(dev, i, kuid, kgid);
+ if (error)
+ break;
+ }
+
+ return error;
+#else
+ return 0;
+#endif /* CONFIG_SYSFS */
+}
+
static int register_queue_kobjects(struct net_device *dev)
{
int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
@@ -1554,6 +1792,31 @@ error:
return error;
}
+static int queue_change_owner(struct net_device *ndev, kuid_t kuid, kgid_t kgid)
+{
+ int error = 0, real_rx = 0, real_tx = 0;
+
+#ifdef CONFIG_SYSFS
+ if (ndev->queues_kset) {
+ error = sysfs_change_owner(&ndev->queues_kset->kobj, kuid, kgid);
+ if (error)
+ return error;
+ }
+ real_rx = ndev->real_num_rx_queues;
+#endif
+ real_tx = ndev->real_num_tx_queues;
+
+ error = net_rx_queue_change_owner(ndev, real_rx, kuid, kgid);
+ if (error)
+ return error;
+
+ error = net_tx_queue_change_owner(ndev, real_tx, kuid, kgid);
+ if (error)
+ return error;
+
+ return 0;
+}
+
static void remove_queue_kobjects(struct net_device *dev)
{
int real_rx = 0, real_tx = 0;
@@ -1565,6 +1828,9 @@ static void remove_queue_kobjects(struct net_device *dev)
net_rx_queue_update_kobjects(dev, real_rx, 0);
netdev_queue_update_kobjects(dev, real_tx, 0);
+
+ dev->real_num_rx_queues = 0;
+ dev->real_num_tx_queues = 0;
#ifdef CONFIG_SYSFS
kset_unregister(dev->queues_kset);
#endif
@@ -1669,15 +1935,15 @@ static struct class net_class __ro_after_init = {
.get_ownership = net_get_ownership,
};
-#ifdef CONFIG_OF_NET
+#ifdef CONFIG_OF
static int of_dev_node_match(struct device *dev, const void *data)
{
- int ret = 0;
-
- if (dev->parent)
- ret = dev->parent->of_node == data;
+ for (; dev; dev = dev->parent) {
+ if (dev->of_node == data)
+ return 1;
+ }
- return ret == 0 ? dev->of_node == data : ret;
+ return 0;
}
/*
@@ -1709,7 +1975,7 @@ void netdev_unregister_kobject(struct net_device *ndev)
{
struct device *dev = &ndev->dev;
- if (!refcount_read(&dev_net(ndev)->count))
+ if (!refcount_read(&dev_net(ndev)->ns.count))
dev_set_uevent_suppress(dev, 1);
kobject_get(&dev->kobj);
@@ -1742,14 +2008,8 @@ int netdev_register_kobject(struct net_device *ndev)
*groups++ = &netstat_group;
-#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
- if (ndev->ieee80211_ptr)
+ if (wireless_group_needed(ndev))
*groups++ = &wireless_group;
-#if IS_ENABLED(CONFIG_WIRELESS_EXT)
- else if (ndev->wireless_handlers)
- *groups++ = &wireless_group;
-#endif
-#endif
#endif /* CONFIG_SYSFS */
error = device_add(dev);
@@ -1767,6 +2027,37 @@ int netdev_register_kobject(struct net_device *ndev)
return error;
}
+/* Change owner for sysfs entries when moving network devices across network
+ * namespaces owned by different user namespaces.
+ */
+int netdev_change_owner(struct net_device *ndev, const struct net *net_old,
+ const struct net *net_new)
+{
+ kuid_t old_uid = GLOBAL_ROOT_UID, new_uid = GLOBAL_ROOT_UID;
+ kgid_t old_gid = GLOBAL_ROOT_GID, new_gid = GLOBAL_ROOT_GID;
+ struct device *dev = &ndev->dev;
+ int error;
+
+ net_ns_get_ownership(net_old, &old_uid, &old_gid);
+ net_ns_get_ownership(net_new, &new_uid, &new_gid);
+
+ /* The network namespace was changed but the owning user namespace is
+ * identical so there's no need to change the owner of sysfs entries.
+ */
+ if (uid_eq(old_uid, new_uid) && gid_eq(old_gid, new_gid))
+ return 0;
+
+ error = device_change_owner(dev, new_uid, new_gid);
+ if (error)
+ return error;
+
+ error = queue_change_owner(ndev, new_uid, new_gid);
+ if (error)
+ return error;
+
+ return 0;
+}
+
int netdev_class_create_file_ns(const struct class_attribute *class_attr,
const void *ns)
{
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
index 006876c7b78d..8a5b04c2699a 100644
--- a/net/core/net-sysfs.h
+++ b/net/core/net-sysfs.h
@@ -8,5 +8,7 @@ void netdev_unregister_kobject(struct net_device *);
int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
int netdev_queue_update_kobjects(struct net_device *net,
int old_num, int new_num);
+int netdev_change_owner(struct net_device *, const struct net *net_old,
+ const struct net *net_new);
#endif
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 283ddb2dbc7d..c40cd8dd75c7 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -60,3 +60,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset);
+EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 757cc1d084e7..f64654df71a2 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -19,6 +19,7 @@
#include <linux/net_namespace.h>
#include <linux/sched/task.h>
#include <linux/uidgid.h>
+#include <linux/cookie.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -43,13 +44,7 @@ EXPORT_SYMBOL_GPL(net_rwsem);
static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) };
#endif
-struct net init_net = {
- .count = REFCOUNT_INIT(1),
- .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
-#ifdef CONFIG_KEYS
- .key_domain = &init_net_key_domain,
-#endif
-};
+struct net init_net;
EXPORT_SYMBOL(init_net);
static bool init_net_initialized;
@@ -69,6 +64,8 @@ EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
+DEFINE_COOKIE(net_cookie);
+
static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
@@ -95,7 +92,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
}
ng = net_alloc_generic();
- if (ng == NULL)
+ if (!ng)
return -ENOMEM;
/*
@@ -120,6 +117,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
static int ops_init(const struct pernet_operations *ops, struct net *net)
{
+ struct net_generic *ng;
int err = -ENOMEM;
void *data = NULL;
@@ -138,20 +136,19 @@ static int ops_init(const struct pernet_operations *ops, struct net *net)
if (!err)
return 0;
+ if (ops->id && ops->size) {
cleanup:
+ ng = rcu_dereference_protected(net->gen,
+ lockdep_is_held(&pernet_ops_rwsem));
+ ng->ptr[*ops->id] = NULL;
+ }
+
kfree(data);
out:
return err;
}
-static void ops_free(const struct pernet_operations *ops, struct net *net)
-{
- if (ops->id && ops->size) {
- kfree(net_generic(net, *ops->id));
- }
-}
-
static void ops_pre_exit_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
@@ -168,8 +165,10 @@ static void ops_exit_list(const struct pernet_operations *ops,
{
struct net *net;
if (ops->exit) {
- list_for_each_entry(net, net_exit_list, exit_list)
+ list_for_each_entry(net, net_exit_list, exit_list) {
ops->exit(net);
+ cond_resched();
+ }
}
if (ops->exit_batch)
ops->exit_batch(net_exit_list);
@@ -181,7 +180,7 @@ static void ops_free_list(const struct pernet_operations *ops,
struct net *net;
if (ops->size && ops->id) {
list_for_each_entry(net, net_exit_list, exit_list)
- ops_free(ops, net);
+ kfree(net_generic(net, *ops->id));
}
}
@@ -234,13 +233,13 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
{
int id;
- if (refcount_read(&net->count) == 0)
+ if (refcount_read(&net->ns.count) == 0)
return NETNSA_NSID_NOT_ASSIGNED;
- spin_lock(&net->nsid_lock);
+ spin_lock_bh(&net->nsid_lock);
id = __peernet2id(net, peer);
if (id >= 0) {
- spin_unlock(&net->nsid_lock);
+ spin_unlock_bh(&net->nsid_lock);
return id;
}
@@ -250,12 +249,12 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
* just been idr_remove()'d from there in cleanup_net().
*/
if (!maybe_get_net(peer)) {
- spin_unlock(&net->nsid_lock);
+ spin_unlock_bh(&net->nsid_lock);
return NETNSA_NSID_NOT_ASSIGNED;
}
id = alloc_netid(net, peer, -1);
- spin_unlock(&net->nsid_lock);
+ spin_unlock_bh(&net->nsid_lock);
put_net(peer);
if (id < 0)
@@ -303,6 +302,7 @@ struct net *get_net_ns_by_id(const struct net *net, int id)
return peer;
}
+EXPORT_SYMBOL_GPL(get_net_ns_by_id);
/*
* setup_net runs the initializers for the network namespace object.
@@ -314,9 +314,14 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
int error = 0;
LIST_HEAD(net_exit_list);
- refcount_set(&net->count, 1);
+ refcount_set(&net->ns.count, 1);
+ ref_tracker_dir_init(&net->refcnt_tracker, 128);
+
refcount_set(&net->passive, 1);
get_random_bytes(&net->hash_mix, sizeof(u32));
+ preempt_disable();
+ net->net_cookie = gen_cookie_next(&net_cookie);
+ preempt_enable();
net->dev_base_seq = 1;
net->user_ns = user_ns;
idr_init(&net->netns_ids);
@@ -360,6 +365,8 @@ out_undo:
static int __net_init net_defaults_init_net(struct net *net)
{
net->core.sysctl_somaxconn = SOMAXCONN;
+ net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
+
return 0;
}
@@ -427,15 +434,18 @@ out_free:
static void net_free(struct net *net)
{
- kfree(rcu_access_pointer(net->gen));
- kmem_cache_free(net_cachep, net);
+ if (refcount_dec_and_test(&net->passive)) {
+ kfree(rcu_access_pointer(net->gen));
+ kmem_cache_free(net_cachep, net);
+ }
}
void net_drop_ns(void *p)
{
- struct net *ns = p;
- if (ns && refcount_dec_and_test(&ns->passive))
- net_free(ns);
+ struct net *net = (struct net *)p;
+
+ if (net)
+ net_free(net);
}
struct net *copy_net_ns(unsigned long flags,
@@ -471,9 +481,11 @@ struct net *copy_net_ns(unsigned long flags,
if (rv < 0) {
put_userns:
+#ifdef CONFIG_KEYS
key_remove_domain(net->key_domain);
+#endif
put_user_ns(user_ns);
- net_drop_ns(net);
+ net_free(net);
dec_ucounts:
dec_net_namespaces(ucounts);
return ERR_PTR(rv);
@@ -520,20 +532,20 @@ static void unhash_nsid(struct net *net, struct net *last)
for_each_net(tmp) {
int id;
- spin_lock(&tmp->nsid_lock);
+ spin_lock_bh(&tmp->nsid_lock);
id = __peernet2id(tmp, net);
if (id >= 0)
idr_remove(&tmp->netns_ids, id);
- spin_unlock(&tmp->nsid_lock);
+ spin_unlock_bh(&tmp->nsid_lock);
if (id >= 0)
rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL,
GFP_KERNEL);
if (tmp == last)
break;
}
- spin_lock(&net->nsid_lock);
+ spin_lock_bh(&net->nsid_lock);
idr_destroy(&net->netns_ids);
- spin_unlock(&net->nsid_lock);
+ spin_unlock_bh(&net->nsid_lock);
}
static LLIST_HEAD(cleanup_list);
@@ -603,9 +615,11 @@ static void cleanup_net(struct work_struct *work)
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
dec_net_namespaces(net->ucounts);
+#ifdef CONFIG_KEYS
key_remove_domain(net->key_domain);
+#endif
put_user_ns(net->user_ns);
- net_drop_ns(net);
+ net_free(net);
}
}
@@ -629,12 +643,25 @@ static DECLARE_WORK(net_cleanup_work, cleanup_net);
void __put_net(struct net *net)
{
+ ref_tracker_dir_exit(&net->refcnt_tracker);
/* Cleanup the network namespace in process context */
if (llist_add(&net->cleanup_list, &cleanup_list))
queue_work(netns_wq, &net_cleanup_work);
}
EXPORT_SYMBOL_GPL(__put_net);
+/**
+ * get_net_ns - increment the refcount of the network namespace
+ * @ns: common namespace (net)
+ *
+ * Returns the net's common namespace.
+ */
+struct ns_common *get_net_ns(struct ns_common *ns)
+{
+ return &get_net(container_of(ns, struct net, ns))->ns;
+}
+EXPORT_SYMBOL_GPL(get_net_ns);
+
struct net *get_net_ns_by_fd(int fd)
{
struct file *file;
@@ -654,14 +681,8 @@ struct net *get_net_ns_by_fd(int fd)
fput(file);
return net;
}
-
-#else
-struct net *get_net_ns_by_fd(int fd)
-{
- return ERR_PTR(-EINVAL);
-}
-#endif
EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
+#endif
struct net *get_net_ns_by_pid(pid_t pid)
{
@@ -746,9 +767,9 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
return PTR_ERR(peer);
}
- spin_lock(&net->nsid_lock);
+ spin_lock_bh(&net->nsid_lock);
if (__peernet2id(net, peer) >= 0) {
- spin_unlock(&net->nsid_lock);
+ spin_unlock_bh(&net->nsid_lock);
err = -EEXIST;
NL_SET_BAD_ATTR(extack, nla);
NL_SET_ERR_MSG(extack,
@@ -757,7 +778,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
}
err = alloc_netid(net, peer, nsid);
- spin_unlock(&net->nsid_lock);
+ spin_unlock_bh(&net->nsid_lock);
if (err >= 0) {
rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid,
nlh, GFP_KERNEL);
@@ -1067,7 +1088,7 @@ out:
rtnl_set_sk_err(net, RTNLGRP_NSID, err);
}
-static int __init net_ns_init(void)
+void __init net_ns_init(void)
{
struct net_generic *ng;
@@ -1088,6 +1109,9 @@ static int __init net_ns_init(void)
rcu_assign_pointer(init_net.gen, ng);
+#ifdef CONFIG_KEYS
+ init_net.key_domain = &init_net_key_domain;
+#endif
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net, &init_user_ns))
panic("Could not setup the initial network namespace");
@@ -1102,11 +1126,15 @@ static int __init net_ns_init(void)
RTNL_FLAG_DOIT_UNLOCKED);
rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
RTNL_FLAG_DOIT_UNLOCKED);
-
- return 0;
}
-pure_initcall(net_ns_init);
+static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list)
+{
+ ops_pre_exit_list(ops, net_exit_list);
+ synchronize_rcu();
+ ops_exit_list(ops, net_exit_list);
+ ops_free_list(ops, net_exit_list);
+}
#ifdef CONFIG_NET_NS
static int __register_pernet_operations(struct list_head *list,
@@ -1133,10 +1161,7 @@ static int __register_pernet_operations(struct list_head *list,
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
- ops_pre_exit_list(ops, &net_exit_list);
- synchronize_rcu();
- ops_exit_list(ops, &net_exit_list);
- ops_free_list(ops, &net_exit_list);
+ free_exit_list(ops, &net_exit_list);
return error;
}
@@ -1149,10 +1174,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
/* See comment in __register_pernet_operations() */
for_each_net(net)
list_add_tail(&net->exit_list, &net_exit_list);
- ops_pre_exit_list(ops, &net_exit_list);
- synchronize_rcu();
- ops_exit_list(ops, &net_exit_list);
- ops_free_list(ops, &net_exit_list);
+
+ free_exit_list(ops, &net_exit_list);
}
#else
@@ -1175,10 +1198,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
} else {
LIST_HEAD(net_exit_list);
list_add(&init_net.exit_list, &net_exit_list);
- ops_pre_exit_list(ops, &net_exit_list);
- synchronize_rcu();
- ops_exit_list(ops, &net_exit_list);
- ops_free_list(ops, &net_exit_list);
+ free_exit_list(ops, &net_exit_list);
}
}
@@ -1338,12 +1358,13 @@ static void netns_put(struct ns_common *ns)
put_net(to_net_ns(ns));
}
-static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int netns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct net *net = to_net_ns(ns);
if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
put_net(nsproxy->net_ns);
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index b4c87fe31be2..d6a70aeaa503 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -66,17 +66,13 @@ struct update_classid_context {
#define UPDATE_CLASSID_BATCH 1000
-static int update_classid_sock(const void *v, struct file *file, unsigned n)
+static int update_classid_sock(const void *v, struct file *file, unsigned int n)
{
- int err;
struct update_classid_context *ctx = (void *)v;
- struct socket *sock = sock_from_file(file, &err);
+ struct socket *sock = sock_from_file(file);
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
+ if (sock)
sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
- spin_unlock(&cgroup_sk_update_lock);
- }
if (--ctx->batch == 0) {
ctx->batch = UPDATE_CLASSID_BATCH;
return n + 1;
@@ -122,15 +118,11 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
struct css_task_iter it;
struct task_struct *p;
- cgroup_sk_alloc_disable();
-
cs->classid = (u32)value;
css_task_iter_start(css, 0, &it);
- while ((p = css_task_iter_next(&it))) {
+ while ((p = css_task_iter_next(&it)))
update_classid_task(p, cs->classid);
- cond_resched();
- }
css_task_iter_end(&it);
return 0;
diff --git a/net/core/netevent.c b/net/core/netevent.c
index d76ed7739c70..5bb615e963cc 100644
--- a/net/core/netevent.c
+++ b/net/core/netevent.c
@@ -32,7 +32,7 @@ int register_netevent_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(register_netevent_notifier);
/**
- * netevent_unregister_notifier - unregister a netevent notifier block
+ * unregister_netevent_notifier - unregister a netevent notifier block
* @nb: notifier
*
* Unregister a notifier previously registered by
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 849380a622ef..9be762e1d042 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -36,6 +36,7 @@
#include <net/ip6_checksum.h>
#include <asm/unaligned.h>
#include <trace/events/napi.h>
+#include <linux/kconfig.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -69,10 +70,11 @@ module_param(carrier_timeout, uint, 0644);
#define np_notice(np, fmt, ...) \
pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
-static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
- struct netdev_queue *txq)
+static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb,
+ struct net_device *dev,
+ struct netdev_queue *txq)
{
- int status = NETDEV_TX_OK;
+ netdev_tx_t status = NETDEV_TX_OK;
netdev_features_t features;
features = netif_skb_features(skb);
@@ -161,7 +163,7 @@ static void poll_napi(struct net_device *dev)
struct napi_struct *napi;
int cpu = smp_processor_id();
- list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
poll_one_napi(napi);
smp_store_release(&napi->poll_owner, -1);
@@ -296,7 +298,7 @@ static int netpoll_owner_active(struct net_device *dev)
{
struct napi_struct *napi;
- list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
if (napi->poll_owner == smp_processor_id())
return 1;
}
@@ -304,20 +306,22 @@ static int netpoll_owner_active(struct net_device *dev)
}
/* call with IRQ disabled */
-void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
- struct net_device *dev)
+static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
{
- int status = NETDEV_TX_BUSY;
+ netdev_tx_t status = NETDEV_TX_BUSY;
+ struct net_device *dev;
unsigned long tries;
/* It is up to the caller to keep npinfo alive. */
struct netpoll_info *npinfo;
lockdep_assert_irqs_disabled();
- npinfo = rcu_dereference_bh(np->dev->npinfo);
+ dev = np->dev;
+ npinfo = rcu_dereference_bh(dev->npinfo);
+
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
dev_kfree_skb_irq(skb);
- return;
+ return NET_XMIT_DROP;
}
/* don't get messages out of order, and no recursion */
@@ -356,8 +360,25 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
skb_queue_tail(&npinfo->txq, skb);
schedule_delayed_work(&npinfo->tx_work,0);
}
+ return NETDEV_TX_OK;
}
-EXPORT_SYMBOL(netpoll_send_skb_on_dev);
+
+netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+{
+ unsigned long flags;
+ netdev_tx_t ret;
+
+ if (unlikely(!np)) {
+ dev_kfree_skb_irq(skb);
+ ret = NET_XMIT_DROP;
+ } else {
+ local_irq_save(flags);
+ ret = __netpoll_send_skb(np, skb);
+ local_irq_restore(flags);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(netpoll_send_skb);
void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
{
@@ -369,7 +390,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
static atomic_t ip_ident;
struct ipv6hdr *ip6h;
- WARN_ON_ONCE(!irqs_disabled());
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ WARN_ON_ONCE(!irqs_disabled());
udp_len = len + sizeof(*udph);
if (np->ipv6)
@@ -408,7 +430,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
ip6h = ipv6_hdr(skb);
/* ip6h->version = 6; ip6h->priority = 0; */
- put_unaligned(0x60, (unsigned char *)ip6h);
+ *(unsigned char *)ip6h = 0x60;
ip6h->flow_lbl[0] = 0;
ip6h->flow_lbl[1] = 0;
ip6h->flow_lbl[2] = 0;
@@ -436,7 +458,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
iph = ip_hdr(skb);
/* iph->version = 4; iph->ihl = 5; */
- put_unaligned(0x45, (unsigned char *)iph);
+ *(unsigned char *)iph = 0x45;
iph->tos = 0;
put_unaligned(htons(ip_len), &(iph->tot_len));
iph->id = htons(atomic_inc_return(&ip_ident));
@@ -534,7 +556,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
if ((delim = strchr(cur, ',')) == NULL)
goto parse_failed;
*delim = 0;
- strlcpy(np->dev_name, cur, sizeof(np->dev_name));
+ strscpy(np->dev_name, cur, sizeof(np->dev_name));
cur = delim;
}
cur++;
@@ -588,7 +610,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
int err;
np->dev = ndev;
- strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
+ strscpy(np->dev_name, ndev->name, IFNAMSIZ);
if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
np_err(np, "%s doesn't support polling, aborting\n",
@@ -754,7 +776,7 @@ put_noaddr:
err = __netpoll_setup(np, ndev);
if (err)
goto put;
-
+ netdev_tracker_alloc(ndev, &np->dev_tracker, GFP_KERNEL);
rtnl_unlock();
return 0;
@@ -831,7 +853,7 @@ void netpoll_cleanup(struct netpoll *np)
if (!np->dev)
goto out;
__netpoll_cleanup(np);
- dev_put(np->dev);
+ netdev_put(np->dev, &np->dev_tracker);
np->dev = NULL;
out:
rtnl_unlock();
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 8881dd943dd0..8456dfbe2eb4 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
if (!dev)
return -ENODEV;
- cgroup_sk_alloc_disable();
-
rtnl_lock();
ret = netprio_set_prio(of_css(of), dev, prio);
@@ -220,14 +218,11 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
static int update_netprio(const void *v, struct file *file, unsigned n)
{
- int err;
- struct socket *sock = sock_from_file(file, &err);
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
+ struct socket *sock = sock_from_file(file);
+
+ if (sock)
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
(unsigned long)v);
- spin_unlock(&cgroup_sk_update_lock);
- }
return 0;
}
diff --git a/net/core/of_net.c b/net/core/of_net.c
new file mode 100644
index 000000000000..f1a9bf7578e7
--- /dev/null
+++ b/net/core/of_net.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * OF helpers for network devices.
+ *
+ * Initially copied out of arch/powerpc/kernel/prom_parse.c
+ */
+#include <linux/etherdevice.h>
+#include <linux/kernel.h>
+#include <linux/of_net.h>
+#include <linux/of_platform.h>
+#include <linux/phy.h>
+#include <linux/export.h>
+#include <linux/device.h>
+#include <linux/nvmem-consumer.h>
+
+/**
+ * of_get_phy_mode - Get phy mode for given device_node
+ * @np: Pointer to the given device_node
+ * @interface: Pointer to the result
+ *
+ * The function gets phy interface string from property 'phy-mode' or
+ * 'phy-connection-type'. The index in phy_modes table is set in
+ * interface and 0 returned. In case of error interface is set to
+ * PHY_INTERFACE_MODE_NA and an errno is returned, e.g. -ENODEV.
+ */
+int of_get_phy_mode(struct device_node *np, phy_interface_t *interface)
+{
+ const char *pm;
+ int err, i;
+
+ *interface = PHY_INTERFACE_MODE_NA;
+
+ err = of_property_read_string(np, "phy-mode", &pm);
+ if (err < 0)
+ err = of_property_read_string(np, "phy-connection-type", &pm);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++)
+ if (!strcasecmp(pm, phy_modes(i))) {
+ *interface = i;
+ return 0;
+ }
+
+ return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(of_get_phy_mode);
+
+static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr)
+{
+ struct property *pp = of_find_property(np, name, NULL);
+
+ if (pp && pp->length == ETH_ALEN && is_valid_ether_addr(pp->value)) {
+ memcpy(addr, pp->value, ETH_ALEN);
+ return 0;
+ }
+ return -ENODEV;
+}
+
+static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr)
+{
+ struct platform_device *pdev = of_find_device_by_node(np);
+ struct nvmem_cell *cell;
+ const void *mac;
+ size_t len;
+ int ret;
+
+ /* Try lookup by device first, there might be a nvmem_cell_lookup
+ * associated with a given device.
+ */
+ if (pdev) {
+ ret = nvmem_get_mac_address(&pdev->dev, addr);
+ put_device(&pdev->dev);
+ return ret;
+ }
+
+ cell = of_nvmem_cell_get(np, "mac-address");
+ if (IS_ERR(cell))
+ return PTR_ERR(cell);
+
+ mac = nvmem_cell_read(cell, &len);
+ nvmem_cell_put(cell);
+
+ if (IS_ERR(mac))
+ return PTR_ERR(mac);
+
+ if (len != ETH_ALEN || !is_valid_ether_addr(mac)) {
+ kfree(mac);
+ return -EINVAL;
+ }
+
+ memcpy(addr, mac, ETH_ALEN);
+ kfree(mac);
+
+ return 0;
+}
+
+/**
+ * of_get_mac_address()
+ * @np: Caller's Device Node
+ * @addr: Pointer to a six-byte array for the result
+ *
+ * Search the device tree for the best MAC address to use. 'mac-address' is
+ * checked first, because that is supposed to contain to "most recent" MAC
+ * address. If that isn't set, then 'local-mac-address' is checked next,
+ * because that is the default address. If that isn't set, then the obsolete
+ * 'address' is checked, just in case we're using an old device tree. If any
+ * of the above isn't set, then try to get MAC address from nvmem cell named
+ * 'mac-address'.
+ *
+ * Note that the 'address' property is supposed to contain a virtual address of
+ * the register set, but some DTS files have redefined that property to be the
+ * MAC address.
+ *
+ * All-zero MAC addresses are rejected, because those could be properties that
+ * exist in the device tree, but were not set by U-Boot. For example, the
+ * DTS could define 'mac-address' and 'local-mac-address', with zero MAC
+ * addresses. Some older U-Boots only initialized 'local-mac-address'. In
+ * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists
+ * but is all zeros.
+ *
+ * Return: 0 on success and errno in case of error.
+*/
+int of_get_mac_address(struct device_node *np, u8 *addr)
+{
+ int ret;
+
+ if (!np)
+ return -ENODEV;
+
+ ret = of_get_mac_addr(np, "mac-address", addr);
+ if (!ret)
+ return 0;
+
+ ret = of_get_mac_addr(np, "local-mac-address", addr);
+ if (!ret)
+ return 0;
+
+ ret = of_get_mac_addr(np, "address", addr);
+ if (!ret)
+ return 0;
+
+ return of_get_mac_addr_nvmem(np, addr);
+}
+EXPORT_SYMBOL(of_get_mac_address);
+
+/**
+ * of_get_ethdev_address()
+ * @np: Caller's Device Node
+ * @dev: Pointer to netdevice which address will be updated
+ *
+ * Search the device tree for the best MAC address to use.
+ * If found set @dev->dev_addr to that address.
+ *
+ * See documentation of of_get_mac_address() for more information on how
+ * the best address is determined.
+ *
+ * Return: 0 on success and errno in case of error.
+ */
+int of_get_ethdev_address(struct device_node *np, struct net_device *dev)
+{
+ u8 addr[ETH_ALEN];
+ int ret;
+
+ ret = of_get_mac_address(np, addr);
+ if (!ret)
+ eth_hw_addr_set(dev, addr);
+ return ret;
+}
+EXPORT_SYMBOL(of_get_ethdev_address);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 10d2b255df5e..9b203d8660e4 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -11,16 +11,128 @@
#include <linux/device.h>
#include <net/page_pool.h>
+#include <net/xdp.h>
+
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
-#include <linux/mm.h> /* for __put_page() */
+#include <linux/mm.h> /* for put_page() */
+#include <linux/poison.h>
+#include <linux/ethtool.h>
#include <trace/events/page_pool.h>
#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)
+#define BIAS_MAX LONG_MAX
+
+#ifdef CONFIG_PAGE_POOL_STATS
+/* alloc_stat_inc is intended to be used in softirq context */
+#define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++)
+/* recycle_stat_inc is safe to use when preemption is possible. */
+#define recycle_stat_inc(pool, __stat) \
+ do { \
+ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
+ this_cpu_inc(s->__stat); \
+ } while (0)
+
+#define recycle_stat_add(pool, __stat, val) \
+ do { \
+ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
+ this_cpu_add(s->__stat, val); \
+ } while (0)
+
+static const char pp_stats[][ETH_GSTRING_LEN] = {
+ "rx_pp_alloc_fast",
+ "rx_pp_alloc_slow",
+ "rx_pp_alloc_slow_ho",
+ "rx_pp_alloc_empty",
+ "rx_pp_alloc_refill",
+ "rx_pp_alloc_waive",
+ "rx_pp_recycle_cached",
+ "rx_pp_recycle_cache_full",
+ "rx_pp_recycle_ring",
+ "rx_pp_recycle_ring_full",
+ "rx_pp_recycle_released_ref",
+};
+
+bool page_pool_get_stats(struct page_pool *pool,
+ struct page_pool_stats *stats)
+{
+ int cpu = 0;
+
+ if (!stats)
+ return false;
+
+ /* The caller is responsible to initialize stats. */
+ stats->alloc_stats.fast += pool->alloc_stats.fast;
+ stats->alloc_stats.slow += pool->alloc_stats.slow;
+ stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
+ stats->alloc_stats.empty += pool->alloc_stats.empty;
+ stats->alloc_stats.refill += pool->alloc_stats.refill;
+ stats->alloc_stats.waive += pool->alloc_stats.waive;
+
+ for_each_possible_cpu(cpu) {
+ const struct page_pool_recycle_stats *pcpu =
+ per_cpu_ptr(pool->recycle_stats, cpu);
+
+ stats->recycle_stats.cached += pcpu->cached;
+ stats->recycle_stats.cache_full += pcpu->cache_full;
+ stats->recycle_stats.ring += pcpu->ring;
+ stats->recycle_stats.ring_full += pcpu->ring_full;
+ stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(page_pool_get_stats);
+
+u8 *page_pool_ethtool_stats_get_strings(u8 *data)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
+ memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
+ data += ETH_GSTRING_LEN;
+ }
+
+ return data;
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
+
+int page_pool_ethtool_stats_get_count(void)
+{
+ return ARRAY_SIZE(pp_stats);
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
+
+u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
+{
+ struct page_pool_stats *pool_stats = stats;
+
+ *data++ = pool_stats->alloc_stats.fast;
+ *data++ = pool_stats->alloc_stats.slow;
+ *data++ = pool_stats->alloc_stats.slow_high_order;
+ *data++ = pool_stats->alloc_stats.empty;
+ *data++ = pool_stats->alloc_stats.refill;
+ *data++ = pool_stats->alloc_stats.waive;
+ *data++ = pool_stats->recycle_stats.cached;
+ *data++ = pool_stats->recycle_stats.cache_full;
+ *data++ = pool_stats->recycle_stats.ring;
+ *data++ = pool_stats->recycle_stats.ring_full;
+ *data++ = pool_stats->recycle_stats.released_refcnt;
+
+ return data;
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get);
+
+#else
+#define alloc_stat_inc(pool, __stat)
+#define recycle_stat_inc(pool, __stat)
+#define recycle_stat_add(pool, __stat, val)
+#endif
+
static int page_pool_init(struct page_pool *pool,
const struct page_pool_params *params)
{
@@ -43,9 +155,11 @@ static int page_pool_init(struct page_pool *pool,
* DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
* which is the XDP_TX use-case.
*/
- if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
- (pool->p.dma_dir != DMA_BIDIRECTIONAL))
- return -EINVAL;
+ if (pool->p.flags & PP_FLAG_DMA_MAP) {
+ if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
+ (pool->p.dma_dir != DMA_BIDIRECTIONAL))
+ return -EINVAL;
+ }
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
/* In order to request DMA-sync-for-device the page
@@ -62,6 +176,16 @@ static int page_pool_init(struct page_pool *pool,
*/
}
+ if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
+ pool->p.flags & PP_FLAG_PAGE_FRAG)
+ return -EINVAL;
+
+#ifdef CONFIG_PAGE_POOL_STATS
+ pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
+ if (!pool->recycle_stats)
+ return -ENOMEM;
+#endif
+
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
return -ENOMEM;
@@ -96,7 +220,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
}
EXPORT_SYMBOL(page_pool_create);
-static void __page_pool_return_page(struct page_pool *pool, struct page *page);
+static void page_pool_return_page(struct page_pool *pool, struct page *page);
noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
@@ -106,8 +230,10 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
int pref_nid; /* preferred NUMA node */
/* Quicker fallback, avoid locks when ring is empty */
- if (__ptr_ring_empty(r))
+ if (__ptr_ring_empty(r)) {
+ alloc_stat_inc(pool, empty);
return NULL;
+ }
/* Softirq guarantee CPU and thus NUMA node is stable. This,
* assumes CPU refilling driver RX-ring will also run RX-NAPI.
@@ -119,9 +245,6 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif
- /* Slower-path: Get pages from locked ring queue */
- spin_lock(&r->consumer_lock);
-
/* Refill alloc array, but only if NUMA match */
do {
page = __ptr_ring_consume(r);
@@ -136,17 +259,19 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
* (2) break out to fallthrough to alloc_pages_node.
* This limit stress on page buddy alloactor.
*/
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
+ alloc_stat_inc(pool, waive);
page = NULL;
break;
}
} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
/* Return last page */
- if (likely(pool->alloc.count > 0))
+ if (likely(pool->alloc.count > 0)) {
page = pool->alloc.cache[--pool->alloc.count];
+ alloc_stat_inc(pool, refill);
+ }
- spin_unlock(&r->consumer_lock);
return page;
}
@@ -159,6 +284,7 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
if (likely(pool->alloc.count)) {
/* Fast-path */
page = pool->alloc.cache[--pool->alloc.count];
+ alloc_stat_inc(pool, fast);
} else {
page = page_pool_refill_alloc_cache(pool);
}
@@ -170,46 +296,18 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool,
struct page *page,
unsigned int dma_sync_size)
{
+ dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+
dma_sync_size = min(dma_sync_size, pool->p.max_len);
- dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
+ dma_sync_single_range_for_device(pool->p.dev, dma_addr,
pool->p.offset, dma_sync_size,
pool->p.dma_dir);
}
-/* slow path */
-noinline
-static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
- gfp_t _gfp)
+static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
- struct page *page;
- gfp_t gfp = _gfp;
dma_addr_t dma;
- /* We could always set __GFP_COMP, and avoid this branch, as
- * prep_new_page() can handle order-0 with __GFP_COMP.
- */
- if (pool->p.order)
- gfp |= __GFP_COMP;
-
- /* FUTURE development:
- *
- * Current slow-path essentially falls back to single page
- * allocations, which doesn't improve performance. This code
- * need bulk allocation support from the page allocator code.
- */
-
- /* Cache was empty, do real allocation */
-#ifdef CONFIG_NUMA
- page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
-#else
- page = alloc_pages(gfp, pool->p.order);
-#endif
- if (!page)
- return NULL;
-
- if (!(pool->p.flags & PP_FLAG_DMA_MAP))
- goto skip_dma_map;
-
/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
* since dma_addr_t can be either 32 or 64 bits and does not always fit
* into page private data (i.e 32bit cpu with 64bit DMA caps)
@@ -218,20 +316,110 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
dma = dma_map_page_attrs(pool->p.dev, page, 0,
(PAGE_SIZE << pool->p.order),
pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
- if (dma_mapping_error(pool->p.dev, dma)) {
+ if (dma_mapping_error(pool->p.dev, dma))
+ return false;
+
+ page_pool_set_dma_addr(page, dma);
+
+ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+ page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
+
+ return true;
+}
+
+static void page_pool_set_pp_info(struct page_pool *pool,
+ struct page *page)
+{
+ page->pp = pool;
+ page->pp_magic |= PP_SIGNATURE;
+ if (pool->p.init_callback)
+ pool->p.init_callback(page, pool->p.init_arg);
+}
+
+static void page_pool_clear_pp_info(struct page *page)
+{
+ page->pp_magic = 0;
+ page->pp = NULL;
+}
+
+static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
+ gfp_t gfp)
+{
+ struct page *page;
+
+ gfp |= __GFP_COMP;
+ page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
+ if (unlikely(!page))
+ return NULL;
+
+ if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
+ unlikely(!page_pool_dma_map(pool, page))) {
put_page(page);
return NULL;
}
- page->dma_addr = dma;
- if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
- page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
+ alloc_stat_inc(pool, slow_high_order);
+ page_pool_set_pp_info(pool, page);
-skip_dma_map:
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
-
trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+ return page;
+}
+
+/* slow path */
+noinline
+static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
+ gfp_t gfp)
+{
+ const int bulk = PP_ALLOC_CACHE_REFILL;
+ unsigned int pp_flags = pool->p.flags;
+ unsigned int pp_order = pool->p.order;
+ struct page *page;
+ int i, nr_pages;
+
+ /* Don't support bulk alloc for high-order pages */
+ if (unlikely(pp_order))
+ return __page_pool_alloc_page_order(pool, gfp);
+
+ /* Unnecessary as alloc cache is empty, but guarantees zero count */
+ if (unlikely(pool->alloc.count > 0))
+ return pool->alloc.cache[--pool->alloc.count];
+
+ /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
+ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
+
+ nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
+ pool->alloc.cache);
+ if (unlikely(!nr_pages))
+ return NULL;
+
+ /* Pages have been filled into alloc.cache array, but count is zero and
+ * page element have not been (possibly) DMA mapped.
+ */
+ for (i = 0; i < nr_pages; i++) {
+ page = pool->alloc.cache[i];
+ if ((pp_flags & PP_FLAG_DMA_MAP) &&
+ unlikely(!page_pool_dma_map(pool, page))) {
+ put_page(page);
+ continue;
+ }
+
+ page_pool_set_pp_info(pool, page);
+ pool->alloc.cache[pool->alloc.count++] = page;
+ /* Track how many pages are held 'in-flight' */
+ pool->pages_state_hold_cnt++;
+ trace_page_pool_state_hold(pool, page,
+ pool->pages_state_hold_cnt);
+ }
+
+ /* Return last page */
+ if (likely(pool->alloc.count > 0)) {
+ page = pool->alloc.cache[--pool->alloc.count];
+ alloc_stat_inc(pool, slow);
+ } else {
+ page = NULL;
+ }
/* When page just alloc'ed is should/must have refcnt 1. */
return page;
@@ -274,44 +462,44 @@ static s32 page_pool_inflight(struct page_pool *pool)
return inflight;
}
-/* Cleanup page_pool state from page */
-static void __page_pool_clean_page(struct page_pool *pool,
- struct page *page)
+/* Disconnects a page (from a page_pool). API users can have a need
+ * to disconnect a page (from a page_pool), to allow it to be used as
+ * a regular page (that will eventually be returned to the normal
+ * page-allocator via put_page).
+ */
+void page_pool_release_page(struct page_pool *pool, struct page *page)
{
dma_addr_t dma;
int count;
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ /* Always account for inflight pages, even if we didn't
+ * map them
+ */
goto skip_dma_unmap;
- dma = page->dma_addr;
- /* DMA unmap */
+ dma = page_pool_get_dma_addr(page);
+
+ /* When page is unmapped, it cannot be returned to our pool */
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC);
- page->dma_addr = 0;
+ page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
+ page_pool_clear_pp_info(page);
+
/* This may be the last page returned, releasing the pool, so
* it is not safe to reference pool afterwards.
*/
- count = atomic_inc_return(&pool->pages_state_release_cnt);
+ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
trace_page_pool_state_release(pool, page, count);
}
-
-/* unmap the page and clean our state */
-void page_pool_unmap_page(struct page_pool *pool, struct page *page)
-{
- /* When page is unmapped, this implies page will not be
- * returned to page_pool.
- */
- __page_pool_clean_page(pool, page);
-}
-EXPORT_SYMBOL(page_pool_unmap_page);
+EXPORT_SYMBOL(page_pool_release_page);
/* Return a page to the page allocator, cleaning up our state */
-static void __page_pool_return_page(struct page_pool *pool, struct page *page)
+static void page_pool_return_page(struct page_pool *pool, struct page *page)
{
- __page_pool_clean_page(pool, page);
+ page_pool_release_page(pool, page);
put_page(page);
/* An optimization would be to call __free_pages(page, pool->p.order)
@@ -320,8 +508,7 @@ static void __page_pool_return_page(struct page_pool *pool, struct page *page)
*/
}
-static bool __page_pool_recycle_into_ring(struct page_pool *pool,
- struct page *page)
+static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
int ret;
/* BH protection not needed if current is serving softirq */
@@ -330,7 +517,12 @@ static bool __page_pool_recycle_into_ring(struct page_pool *pool,
else
ret = ptr_ring_produce_bh(&pool->ring, page);
- return (ret == 0) ? true : false;
+ if (!ret) {
+ recycle_stat_inc(pool, ring);
+ return true;
+ }
+
+ return false;
}
/* Only allow direct recycling in special circumstances, into the
@@ -338,51 +530,52 @@ static bool __page_pool_recycle_into_ring(struct page_pool *pool,
*
* Caller must provide appropriate safe context.
*/
-static bool __page_pool_recycle_direct(struct page *page,
+static bool page_pool_recycle_in_cache(struct page *page,
struct page_pool *pool)
{
- if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
+ if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
+ recycle_stat_inc(pool, cache_full);
return false;
+ }
/* Caller MUST have verified/know (page_ref_count(page) == 1) */
pool->alloc.cache[pool->alloc.count++] = page;
+ recycle_stat_inc(pool, cached);
return true;
}
-/* page is NOT reusable when:
- * 1) allocated when system is under some pressure. (page_is_pfmemalloc)
+/* If the page refcnt == 1, this will try to recycle the page.
+ * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
+ * the configured size min(dma_sync_size, pool->max_len).
+ * If the page refcnt != 1, then the page will be returned to memory
+ * subsystem.
*/
-static bool pool_page_reusable(struct page_pool *pool, struct page *page)
-{
- return !page_is_pfmemalloc(page);
-}
-
-void __page_pool_put_page(struct page_pool *pool, struct page *page,
- unsigned int dma_sync_size, bool allow_direct)
+static __always_inline struct page *
+__page_pool_put_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
{
/* This allocator is optimized for the XDP mode that uses
* one-frame-per-page, but have fallbacks that act like the
* regular page allocator APIs.
*
* refcnt == 1 means page_pool owns page, and can recycle it.
+ *
+ * page is NOT reusable when allocated when system is under
+ * some pressure. (page_is_pfmemalloc)
*/
- if (likely(page_ref_count(page) == 1 &&
- pool_page_reusable(pool, page))) {
+ if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
/* Read barrier done in page_ref_count / READ_ONCE */
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
page_pool_dma_sync_for_device(pool, page,
dma_sync_size);
- if (allow_direct && in_serving_softirq())
- if (__page_pool_recycle_direct(page, pool))
- return;
+ if (allow_direct && in_serving_softirq() &&
+ page_pool_recycle_in_cache(page, pool))
+ return NULL;
- if (!__page_pool_recycle_into_ring(pool, page)) {
- /* Cache full, fallback to free pages */
- __page_pool_return_page(pool, page);
- }
- return;
+ /* Page found as candidate for recycling */
+ return page;
}
/* Fallback/non-XDP mode: API user have elevated refcnt.
*
@@ -397,12 +590,152 @@ void __page_pool_put_page(struct page_pool *pool, struct page *page,
* doing refcnt based recycle tricks, meaning another process
* will be invoking put_page.
*/
- __page_pool_clean_page(pool, page);
+ recycle_stat_inc(pool, released_refcnt);
+ /* Do not replace this with page_pool_return_page() */
+ page_pool_release_page(pool, page);
put_page(page);
+
+ return NULL;
+}
+
+void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
+{
+ page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
+ if (page && !page_pool_recycle_in_ring(pool, page)) {
+ /* Cache full, fallback to free pages */
+ recycle_stat_inc(pool, ring_full);
+ page_pool_return_page(pool, page);
+ }
+}
+EXPORT_SYMBOL(page_pool_put_defragged_page);
+
+/* Caller must not use data area after call, as this function overwrites it */
+void page_pool_put_page_bulk(struct page_pool *pool, void **data,
+ int count)
+{
+ int i, bulk_len = 0;
+
+ for (i = 0; i < count; i++) {
+ struct page *page = virt_to_head_page(data[i]);
+
+ /* It is not the last user for the page frag case */
+ if (!page_pool_is_last_frag(pool, page))
+ continue;
+
+ page = __page_pool_put_page(pool, page, -1, false);
+ /* Approved for bulk recycling in ptr_ring cache */
+ if (page)
+ data[bulk_len++] = page;
+ }
+
+ if (unlikely(!bulk_len))
+ return;
+
+ /* Bulk producer into ptr_ring page_pool cache */
+ page_pool_ring_lock(pool);
+ for (i = 0; i < bulk_len; i++) {
+ if (__ptr_ring_produce(&pool->ring, data[i])) {
+ /* ring full */
+ recycle_stat_inc(pool, ring_full);
+ break;
+ }
+ }
+ recycle_stat_add(pool, ring, i);
+ page_pool_ring_unlock(pool);
+
+ /* Hopefully all pages was return into ptr_ring */
+ if (likely(i == bulk_len))
+ return;
+
+ /* ptr_ring cache full, free remaining pages outside producer lock
+ * since put_page() with refcnt == 1 can be an expensive operation
+ */
+ for (; i < bulk_len; i++)
+ page_pool_return_page(pool, data[i]);
}
-EXPORT_SYMBOL(__page_pool_put_page);
+EXPORT_SYMBOL(page_pool_put_page_bulk);
-static void __page_pool_empty_ring(struct page_pool *pool)
+static struct page *page_pool_drain_frag(struct page_pool *pool,
+ struct page *page)
+{
+ long drain_count = BIAS_MAX - pool->frag_users;
+
+ /* Some user is still using the page frag */
+ if (likely(page_pool_defrag_page(page, drain_count)))
+ return NULL;
+
+ if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
+ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+ page_pool_dma_sync_for_device(pool, page, -1);
+
+ return page;
+ }
+
+ page_pool_return_page(pool, page);
+ return NULL;
+}
+
+static void page_pool_free_frag(struct page_pool *pool)
+{
+ long drain_count = BIAS_MAX - pool->frag_users;
+ struct page *page = pool->frag_page;
+
+ pool->frag_page = NULL;
+
+ if (!page || page_pool_defrag_page(page, drain_count))
+ return;
+
+ page_pool_return_page(pool, page);
+}
+
+struct page *page_pool_alloc_frag(struct page_pool *pool,
+ unsigned int *offset,
+ unsigned int size, gfp_t gfp)
+{
+ unsigned int max_size = PAGE_SIZE << pool->p.order;
+ struct page *page = pool->frag_page;
+
+ if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
+ size > max_size))
+ return NULL;
+
+ size = ALIGN(size, dma_get_cache_alignment());
+ *offset = pool->frag_offset;
+
+ if (page && *offset + size > max_size) {
+ page = page_pool_drain_frag(pool, page);
+ if (page) {
+ alloc_stat_inc(pool, fast);
+ goto frag_reset;
+ }
+ }
+
+ if (!page) {
+ page = page_pool_alloc_pages(pool, gfp);
+ if (unlikely(!page)) {
+ pool->frag_page = NULL;
+ return NULL;
+ }
+
+ pool->frag_page = page;
+
+frag_reset:
+ pool->frag_users = 1;
+ *offset = 0;
+ pool->frag_offset = size;
+ page_pool_fragment_page(page, BIAS_MAX);
+ return page;
+ }
+
+ pool->frag_users++;
+ pool->frag_offset = *offset + size;
+ alloc_stat_inc(pool, fast);
+ return page;
+}
+EXPORT_SYMBOL(page_pool_alloc_frag);
+
+static void page_pool_empty_ring(struct page_pool *pool)
{
struct page *page;
@@ -413,7 +746,7 @@ static void __page_pool_empty_ring(struct page_pool *pool)
pr_crit("%s() page_pool refcnt %d violation\n",
__func__, page_ref_count(page));
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
}
}
@@ -427,6 +760,9 @@ static void page_pool_free(struct page_pool *pool)
if (pool->p.flags & PP_FLAG_DMA_MAP)
put_device(pool->p.dev);
+#ifdef CONFIG_PAGE_POOL_STATS
+ free_percpu(pool->recycle_stats);
+#endif
kfree(pool);
}
@@ -443,7 +779,7 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
*/
while (pool->alloc.count) {
page = pool->alloc.cache[--pool->alloc.count];
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
}
}
@@ -455,7 +791,7 @@ static void page_pool_scrub(struct page_pool *pool)
/* No more consumers should exist, but producers could still
* be in-flight.
*/
- __page_pool_empty_ring(pool);
+ page_pool_empty_ring(pool);
}
static int page_pool_release(struct page_pool *pool)
@@ -493,10 +829,12 @@ static void page_pool_release_retry(struct work_struct *wq)
schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
-void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
+void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
+ struct xdp_mem_info *mem)
{
refcount_inc(&pool->user_cnt);
pool->disconnect = disconnect;
+ pool->xdp_mem_id = mem->id;
}
void page_pool_destroy(struct page_pool *pool)
@@ -507,6 +845,8 @@ void page_pool_destroy(struct page_pool *pool)
if (!page_pool_put(pool))
return;
+ page_pool_free_frag(pool);
+
if (!page_pool_release(pool))
return;
@@ -529,7 +869,36 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
/* Flush pool alloc cache, as refill will check NUMA node */
while (pool->alloc.count) {
page = pool->alloc.cache[--pool->alloc.count];
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
}
}
EXPORT_SYMBOL(page_pool_update_nid);
+
+bool page_pool_return_skb_page(struct page *page)
+{
+ struct page_pool *pp;
+
+ page = compound_head(page);
+
+ /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
+ * in order to preserve any existing bits, such as bit 0 for the
+ * head page of compound page and bit 1 for pfmemalloc page, so
+ * mask those bits for freeing side when doing below checking,
+ * and page_is_pfmemalloc() is checked in __page_pool_put_page()
+ * to avoid recycling the pfmemalloc page.
+ */
+ if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
+ return false;
+
+ pp = page->pp;
+
+ /* Driver set this to memory recycling info. Reset it on recycle.
+ * This will *not* work for NIC using a split-page memory model.
+ * The page will be returned to the pool here regardless of the
+ * 'flipped' fragment being in use or not.
+ */
+ page_pool_put_full_page(pp, page, false);
+
+ return true;
+}
+EXPORT_SYMBOL(page_pool_return_skb_page);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index acc849df60b5..c3763056c554 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -56,7 +56,7 @@
* Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br)
*
* 021124 Finished major redesign and rewrite for new functionality.
- * See Documentation/networking/pktgen.txt for how to use this.
+ * See Documentation/networking/pktgen.rst for how to use this.
*
* The new operation:
* For each CPU one thread/process is created at start. This process checks
@@ -175,6 +175,9 @@
#define IP_NAME_SZ 32
#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
#define MPLS_STACK_BOTTOM htonl(0x00000100)
+/* Max number of internet mix entries that can be specified in imix_weights. */
+#define MAX_IMIX_ENTRIES 20
+#define IMIX_PRECISION 100 /* Precision of IMIX distribution */
#define func_enter() pr_debug("entering %s\n", __func__);
@@ -242,6 +245,12 @@ static char *pkt_flag_names[] = {
#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4)
#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4)
+struct imix_pkt {
+ u64 size;
+ u64 weight;
+ u64 count_so_far;
+};
+
struct flow_state {
__be32 cur_daddr;
int count;
@@ -343,6 +352,12 @@ struct pktgen_dev {
__u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
(see RFC 3260, sec. 4) */
+ /* IMIX */
+ unsigned int n_imix_entries;
+ struct imix_pkt imix_entries[MAX_IMIX_ENTRIES];
+ /* Maps 0-IMIX_PRECISION range to imix_entry based on probability*/
+ __u8 imix_distribution[IMIX_PRECISION];
+
/* MPLS */
unsigned int nr_labels; /* Depth of stack, 0 = no MPLS */
__be32 labels[MAX_MPLS_LABELS];
@@ -395,6 +410,7 @@ struct pktgen_dev {
* device name (not when the inject is
* started as it used to do.)
*/
+ netdevice_tracker dev_tracker;
char odevname[32];
struct flow_state *flows;
unsigned int cflows; /* Concurrent flows (config) */
@@ -467,10 +483,11 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
static int pktgen_device_event(struct notifier_block *, unsigned long, void *);
static void pktgen_run_all_threads(struct pktgen_net *pn);
static void pktgen_reset_all_threads(struct pktgen_net *pn);
-static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn);
+static void pktgen_stop_all_threads(struct pktgen_net *pn);
static void pktgen_stop(struct pktgen_thread *t);
static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
+static void fill_imix_distribution(struct pktgen_dev *pkt_dev);
/* Module parameters, defaults. */
static int pg_count_d __read_mostly = 1000;
@@ -516,14 +533,11 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
data[count - 1] = 0; /* Strip trailing '\n' and terminate string */
if (!strcmp(data, "stop"))
- pktgen_stop_all_threads_ifs(pn);
-
+ pktgen_stop_all_threads(pn);
else if (!strcmp(data, "start"))
pktgen_run_all_threads(pn);
-
else if (!strcmp(data, "reset"))
pktgen_reset_all_threads(pn);
-
else
return -EINVAL;
@@ -532,7 +546,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
static int pgctrl_open(struct inode *inode, struct file *file)
{
- return single_open(file, pgctrl_show, PDE_DATA(inode));
+ return single_open(file, pgctrl_show, pde_data(inode));
}
static const struct proc_ops pktgen_proc_ops = {
@@ -555,6 +569,16 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
(unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size,
pkt_dev->max_pkt_size);
+ if (pkt_dev->n_imix_entries > 0) {
+ seq_puts(seq, " imix_weights: ");
+ for (i = 0; i < pkt_dev->n_imix_entries; i++) {
+ seq_printf(seq, "%llu,%llu ",
+ pkt_dev->imix_entries[i].size,
+ pkt_dev->imix_entries[i].weight);
+ }
+ seq_puts(seq, "\n");
+ }
+
seq_printf(seq,
" frags: %d delay: %llu clone_skb: %d ifname: %s\n",
pkt_dev->nfrags, (unsigned long long) pkt_dev->delay,
@@ -672,6 +696,18 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
(unsigned long long)pkt_dev->sofar,
(unsigned long long)pkt_dev->errors);
+ if (pkt_dev->n_imix_entries > 0) {
+ int i;
+
+ seq_puts(seq, " imix_size_counts: ");
+ for (i = 0; i < pkt_dev->n_imix_entries; i++) {
+ seq_printf(seq, "%llu,%llu ",
+ pkt_dev->imix_entries[i].size,
+ pkt_dev->imix_entries[i].count_so_far);
+ }
+ seq_puts(seq, "\n");
+ }
+
seq_printf(seq,
" started: %lluus stopped: %lluus idle: %lluus\n",
(unsigned long long) ktime_to_us(pkt_dev->started_at),
@@ -795,6 +831,62 @@ done_str:
return i;
}
+/* Parses imix entries from user buffer.
+ * The user buffer should consist of imix entries separated by spaces
+ * where each entry consists of size and weight delimited by commas.
+ * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example.
+ */
+static ssize_t get_imix_entries(const char __user *buffer,
+ struct pktgen_dev *pkt_dev)
+{
+ const int max_digits = 10;
+ int i = 0;
+ long len;
+ char c;
+
+ pkt_dev->n_imix_entries = 0;
+
+ do {
+ unsigned long weight;
+ unsigned long size;
+
+ len = num_arg(&buffer[i], max_digits, &size);
+ if (len < 0)
+ return len;
+ i += len;
+ if (get_user(c, &buffer[i]))
+ return -EFAULT;
+ /* Check for comma between size_i and weight_i */
+ if (c != ',')
+ return -EINVAL;
+ i++;
+
+ if (size < 14 + 20 + 8)
+ size = 14 + 20 + 8;
+
+ len = num_arg(&buffer[i], max_digits, &weight);
+ if (len < 0)
+ return len;
+ if (weight <= 0)
+ return -EINVAL;
+
+ pkt_dev->imix_entries[pkt_dev->n_imix_entries].size = size;
+ pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight;
+
+ i += len;
+ if (get_user(c, &buffer[i]))
+ return -EFAULT;
+
+ i++;
+ pkt_dev->n_imix_entries++;
+
+ if (pkt_dev->n_imix_entries > MAX_IMIX_ENTRIES)
+ return -E2BIG;
+ } while (c == ' ');
+
+ return i;
+}
+
static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
{
unsigned int n = 0;
@@ -922,7 +1014,7 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->min_pkt_size = value;
pkt_dev->cur_pkt_size = value;
}
- sprintf(pg_result, "OK: min_pkt_size=%u",
+ sprintf(pg_result, "OK: min_pkt_size=%d",
pkt_dev->min_pkt_size);
return count;
}
@@ -939,7 +1031,7 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->max_pkt_size = value;
pkt_dev->cur_pkt_size = value;
}
- sprintf(pg_result, "OK: max_pkt_size=%u",
+ sprintf(pg_result, "OK: max_pkt_size=%d",
pkt_dev->max_pkt_size);
return count;
}
@@ -959,7 +1051,21 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->max_pkt_size = value;
pkt_dev->cur_pkt_size = value;
}
- sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size);
+ sprintf(pg_result, "OK: pkt_size=%d", pkt_dev->min_pkt_size);
+ return count;
+ }
+
+ if (!strcmp(name, "imix_weights")) {
+ if (pkt_dev->clone_skb > 0)
+ return -EINVAL;
+
+ len = get_imix_entries(&user_buffer[i], pkt_dev);
+ if (len < 0)
+ return len;
+
+ fill_imix_distribution(pkt_dev);
+
+ i += len;
return count;
}
@@ -981,7 +1087,7 @@ static ssize_t pktgen_if_write(struct file *file,
i += len;
pkt_dev->nfrags = value;
- sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags);
+ sprintf(pg_result, "OK: frags=%d", pkt_dev->nfrags);
return count;
}
if (!strcmp(name, "delay")) {
@@ -1085,10 +1191,16 @@ static ssize_t pktgen_if_write(struct file *file,
len = num_arg(&user_buffer[i], 10, &value);
if (len < 0)
return len;
+ /* clone_skb is not supported for netif_receive xmit_mode and
+ * IMIX mode.
+ */
if ((value > 0) &&
((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
return -ENOTSUPP;
+ if (value > 0 && pkt_dev->n_imix_entries > 0)
+ return -EINVAL;
+
i += len;
pkt_dev->clone_skb = value;
@@ -1146,7 +1258,7 @@ static ssize_t pktgen_if_write(struct file *file,
(!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
return -ENOTSUPP;
pkt_dev->burst = value < 1 ? 1 : value;
- sprintf(pg_result, "OK: burst=%d", pkt_dev->burst);
+ sprintf(pg_result, "OK: burst=%u", pkt_dev->burst);
return count;
}
if (!strcmp(name, "node")) {
@@ -1193,11 +1305,6 @@ static ssize_t pktgen_if_write(struct file *file,
* pktgen_xmit() is called
*/
pkt_dev->last_ok = 1;
-
- /* override clone_skb if user passed default value
- * at module loading time
- */
- pkt_dev->clone_skb = 0;
} else if (strcmp(f, "queue_xmit") == 0) {
pkt_dev->xmit_mode = M_QUEUE_XMIT;
pkt_dev->last_ok = 1;
@@ -1704,7 +1811,7 @@ static ssize_t pktgen_if_write(struct file *file,
static int pktgen_if_open(struct inode *inode, struct file *file)
{
- return single_open(file, pktgen_if_show, PDE_DATA(inode));
+ return single_open(file, pktgen_if_show, pde_data(inode));
}
static const struct proc_ops pktgen_if_proc_ops = {
@@ -1841,7 +1948,7 @@ out:
static int pktgen_thread_open(struct inode *inode, struct file *file)
{
- return single_open(file, pktgen_thread_show, PDE_DATA(inode));
+ return single_open(file, pktgen_thread_show, pde_data(inode));
}
static const struct proc_ops pktgen_thread_proc_ops = {
@@ -1993,7 +2100,7 @@ static int pktgen_setup_dev(const struct pktgen_net *pn,
/* Clean old setups */
if (pkt_dev->odev) {
- dev_put(pkt_dev->odev);
+ netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker);
pkt_dev->odev = NULL;
}
@@ -2003,14 +2110,15 @@ static int pktgen_setup_dev(const struct pktgen_net *pn,
return -ENODEV;
}
- if (odev->type != ARPHRD_ETHER) {
- pr_err("not an ethernet device: \"%s\"\n", ifname);
+ if (odev->type != ARPHRD_ETHER && odev->type != ARPHRD_LOOPBACK) {
+ pr_err("not an ethernet or loopback device: \"%s\"\n", ifname);
err = -EINVAL;
} else if (!netif_running(odev)) {
pr_err("device is down: \"%s\"\n", ifname);
err = -ENETDOWN;
} else {
pkt_dev->odev = odev;
+ netdev_tracker_alloc(odev, &pkt_dev->dev_tracker, GFP_KERNEL);
return 0;
}
@@ -2216,7 +2324,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)
pkt_dev->curfl = 0; /*reset */
}
} else {
- flow = prandom_u32() % pkt_dev->cflows;
+ flow = prandom_u32_max(pkt_dev->cflows);
pkt_dev->curfl = flow;
if (pkt_dev->flows[flow].count > pkt_dev->lflow) {
@@ -2272,10 +2380,9 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {
__u16 t;
if (pkt_dev->flags & F_QUEUE_MAP_RND) {
- t = prandom_u32() %
- (pkt_dev->queue_map_max -
- pkt_dev->queue_map_min + 1)
- + pkt_dev->queue_map_min;
+ t = prandom_u32_max(pkt_dev->queue_map_max -
+ pkt_dev->queue_map_min + 1) +
+ pkt_dev->queue_map_min;
} else {
t = pkt_dev->cur_queue_map + 1;
if (t > pkt_dev->queue_map_max)
@@ -2304,7 +2411,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__u32 tmp;
if (pkt_dev->flags & F_MACSRC_RND)
- mc = prandom_u32() % pkt_dev->src_mac_count;
+ mc = prandom_u32_max(pkt_dev->src_mac_count);
else {
mc = pkt_dev->cur_src_mac_offset++;
if (pkt_dev->cur_src_mac_offset >=
@@ -2330,7 +2437,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__u32 tmp;
if (pkt_dev->flags & F_MACDST_RND)
- mc = prandom_u32() % pkt_dev->dst_mac_count;
+ mc = prandom_u32_max(pkt_dev->dst_mac_count);
else {
mc = pkt_dev->cur_dst_mac_offset++;
@@ -2357,23 +2464,23 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
for (i = 0; i < pkt_dev->nr_labels; i++)
if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
- ((__force __be32)prandom_u32() &
+ ((__force __be32)get_random_u32() &
htonl(0x000fffff));
}
if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) {
- pkt_dev->vlan_id = prandom_u32() & (4096 - 1);
+ pkt_dev->vlan_id = prandom_u32_max(4096);
}
if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) {
- pkt_dev->svlan_id = prandom_u32() & (4096 - 1);
+ pkt_dev->svlan_id = prandom_u32_max(4096);
}
if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {
if (pkt_dev->flags & F_UDPSRC_RND)
- pkt_dev->cur_udp_src = prandom_u32() %
- (pkt_dev->udp_src_max - pkt_dev->udp_src_min)
- + pkt_dev->udp_src_min;
+ pkt_dev->cur_udp_src = prandom_u32_max(
+ pkt_dev->udp_src_max - pkt_dev->udp_src_min) +
+ pkt_dev->udp_src_min;
else {
pkt_dev->cur_udp_src++;
@@ -2384,9 +2491,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {
if (pkt_dev->flags & F_UDPDST_RND) {
- pkt_dev->cur_udp_dst = prandom_u32() %
- (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)
- + pkt_dev->udp_dst_min;
+ pkt_dev->cur_udp_dst = prandom_u32_max(
+ pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) +
+ pkt_dev->udp_dst_min;
} else {
pkt_dev->cur_udp_dst++;
if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max)
@@ -2401,7 +2508,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (imn < imx) {
__u32 t;
if (pkt_dev->flags & F_IPSRC_RND)
- t = prandom_u32() % (imx - imn) + imn;
+ t = prandom_u32_max(imx - imn) + imn;
else {
t = ntohl(pkt_dev->cur_saddr);
t++;
@@ -2423,8 +2530,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->flags & F_IPDST_RND) {
do {
- t = prandom_u32() %
- (imx - imn) + imn;
+ t = prandom_u32_max(imx - imn) +
+ imn;
s = htonl(t);
} while (ipv4_is_loopback(s) ||
ipv4_is_multicast(s) ||
@@ -2461,7 +2568,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
for (i = 0; i < 4; i++) {
pkt_dev->cur_in6_daddr.s6_addr32[i] =
- (((__force __be32)prandom_u32() |
+ (((__force __be32)get_random_u32() |
pkt_dev->min_in6_daddr.s6_addr32[i]) &
pkt_dev->max_in6_daddr.s6_addr32[i]);
}
@@ -2471,15 +2578,23 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
__u32 t;
if (pkt_dev->flags & F_TXSIZE_RND) {
- t = prandom_u32() %
- (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)
- + pkt_dev->min_pkt_size;
+ t = prandom_u32_max(pkt_dev->max_pkt_size -
+ pkt_dev->min_pkt_size) +
+ pkt_dev->min_pkt_size;
} else {
t = pkt_dev->cur_pkt_size + 1;
if (t > pkt_dev->max_pkt_size)
t = pkt_dev->min_pkt_size;
}
pkt_dev->cur_pkt_size = t;
+ } else if (pkt_dev->n_imix_entries > 0) {
+ struct imix_pkt *entry;
+ __u32 t = prandom_u32_max(IMIX_PRECISION);
+ __u8 entry_index = pkt_dev->imix_distribution[t];
+
+ entry = &pkt_dev->imix_entries[entry_index];
+ entry->count_so_far++;
+ pkt_dev->cur_pkt_size = entry->size;
}
set_cur_queue_map(pkt_dev);
@@ -2487,6 +2602,32 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
pkt_dev->flows[flow].count++;
}
+static void fill_imix_distribution(struct pktgen_dev *pkt_dev)
+{
+ int cumulative_probabilites[MAX_IMIX_ENTRIES];
+ int j = 0;
+ __u64 cumulative_prob = 0;
+ __u64 total_weight = 0;
+ int i = 0;
+
+ for (i = 0; i < pkt_dev->n_imix_entries; i++)
+ total_weight += pkt_dev->imix_entries[i].weight;
+
+ /* Fill cumulative_probabilites with sum of normalized probabilities */
+ for (i = 0; i < pkt_dev->n_imix_entries - 1; i++) {
+ cumulative_prob += div64_u64(pkt_dev->imix_entries[i].weight *
+ IMIX_PRECISION,
+ total_weight);
+ cumulative_probabilites[i] = cumulative_prob;
+ }
+ cumulative_probabilites[pkt_dev->n_imix_entries - 1] = 100;
+
+ for (i = 0; i < IMIX_PRECISION; i++) {
+ if (i == cumulative_probabilites[j])
+ j++;
+ pkt_dev->imix_distribution[i] = j;
+ }
+}
#ifdef CONFIG_XFRM
static u32 pktgen_dst_metrics[RTAX_MAX + 1] = {
@@ -3027,20 +3168,25 @@ static void pktgen_run(struct pktgen_thread *t)
t->control &= ~(T_STOP);
}
-static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn)
+static void pktgen_handle_all_threads(struct pktgen_net *pn, u32 flags)
{
struct pktgen_thread *t;
- func_enter();
-
mutex_lock(&pktgen_thread_lock);
list_for_each_entry(t, &pn->pktgen_threads, th_list)
- t->control |= T_STOP;
+ t->control |= (flags);
mutex_unlock(&pktgen_thread_lock);
}
+static void pktgen_stop_all_threads(struct pktgen_net *pn)
+{
+ func_enter();
+
+ pktgen_handle_all_threads(pn, T_STOP);
+}
+
static int thread_is_running(const struct pktgen_thread *t)
{
const struct pktgen_dev *pkt_dev;
@@ -3103,16 +3249,9 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
static void pktgen_run_all_threads(struct pktgen_net *pn)
{
- struct pktgen_thread *t;
-
func_enter();
- mutex_lock(&pktgen_thread_lock);
-
- list_for_each_entry(t, &pn->pktgen_threads, th_list)
- t->control |= (T_RUN);
-
- mutex_unlock(&pktgen_thread_lock);
+ pktgen_handle_all_threads(pn, T_RUN);
/* Propagate thread->control */
schedule_timeout_interruptible(msecs_to_jiffies(125));
@@ -3122,16 +3261,9 @@ static void pktgen_run_all_threads(struct pktgen_net *pn)
static void pktgen_reset_all_threads(struct pktgen_net *pn)
{
- struct pktgen_thread *t;
-
func_enter();
- mutex_lock(&pktgen_thread_lock);
-
- list_for_each_entry(t, &pn->pktgen_threads, th_list)
- t->control |= (T_REMDEVALL);
-
- mutex_unlock(&pktgen_thread_lock);
+ pktgen_handle_all_threads(pn, T_REMDEVALL);
/* Propagate thread->control */
schedule_timeout_interruptible(msecs_to_jiffies(125));
@@ -3157,7 +3289,19 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC,
ktime_to_ns(elapsed));
- bps = pps * 8 * pkt_dev->cur_pkt_size;
+ if (pkt_dev->n_imix_entries > 0) {
+ int i;
+ struct imix_pkt *entry;
+
+ bps = 0;
+ for (i = 0; i < pkt_dev->n_imix_entries; i++) {
+ entry = &pkt_dev->imix_entries[i];
+ bps += entry->size * entry->count_so_far;
+ }
+ bps = div64_u64(bps * 8 * NSEC_PER_SEC, ktime_to_ns(elapsed));
+ } else {
+ bps = pps * 8 * pkt_dev->cur_pkt_size;
+ }
mbps = bps;
do_div(mbps, 1000000);
@@ -3362,7 +3506,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
/* skb was 'freed' by stack, so clean few
* bits and reuse it
*/
- skb_reset_tc(skb);
+ skb_reset_redirect(skb);
} while (--burst > 0);
goto out; /* Skips xmit_mode M_START_XMIT */
} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
@@ -3430,7 +3574,7 @@ xmit_more:
net_info_ratelimited("%s xmit error: %d\n",
pkt_dev->odevname, ret);
pkt_dev->errors++;
- /* fall through */
+ fallthrough;
case NETDEV_TX_BUSY:
/* Retry it next time */
refcount_dec(&(pkt_dev->skb->users));
@@ -3459,12 +3603,11 @@ out:
static int pktgen_thread_worker(void *arg)
{
- DEFINE_WAIT(wait);
struct pktgen_thread *t = arg;
struct pktgen_dev *pkt_dev = NULL;
int cpu = t->cpu;
- BUG_ON(smp_processor_id() != cpu);
+ WARN_ON(smp_processor_id() != cpu);
init_waitqueue_head(&t->queue);
complete(&t->start_done);
@@ -3663,7 +3806,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
return add_dev_to_thread(t, pkt_dev);
out2:
- dev_put(pkt_dev->odev);
+ netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker);
out1:
#ifdef CONFIG_XFRM
free_SAs(pkt_dev);
@@ -3699,7 +3842,7 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
cpu_to_node(cpu),
"kpktgend_%d", cpu);
if (IS_ERR(p)) {
- pr_err("kernel_thread() failed for cpu %d\n", t->cpu);
+ pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu);
list_del(&t->th_list);
kfree(t);
return PTR_ERR(p);
@@ -3757,7 +3900,7 @@ static int pktgen_remove_device(struct pktgen_thread *t,
/* Dis-associate from the interface */
if (pkt_dev->odev) {
- dev_put(pkt_dev->odev);
+ netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker);
pkt_dev->odev = NULL;
}
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
index d964a5147f22..598041b0499e 100644
--- a/net/core/ptp_classifier.c
+++ b/net/core/ptp_classifier.c
@@ -103,10 +103,52 @@ static struct bpf_prog *ptp_insns __read_mostly;
unsigned int ptp_classify_raw(const struct sk_buff *skb)
{
- return BPF_PROG_RUN(ptp_insns, skb);
+ return bpf_prog_run(ptp_insns, skb);
}
EXPORT_SYMBOL_GPL(ptp_classify_raw);
+struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type)
+{
+ u8 *ptr = skb_mac_header(skb);
+
+ if (type & PTP_CLASS_VLAN)
+ ptr += VLAN_HLEN;
+
+ switch (type & PTP_CLASS_PMASK) {
+ case PTP_CLASS_IPV4:
+ ptr += IPV4_HLEN(ptr) + UDP_HLEN;
+ break;
+ case PTP_CLASS_IPV6:
+ ptr += IP6_HLEN + UDP_HLEN;
+ break;
+ case PTP_CLASS_L2:
+ break;
+ default:
+ return NULL;
+ }
+
+ ptr += ETH_HLEN;
+
+ /* Ensure that the entire header is present in this packet. */
+ if (ptr + sizeof(struct ptp_header) > skb->data + skb->len)
+ return NULL;
+
+ return (struct ptp_header *)ptr;
+}
+EXPORT_SYMBOL_GPL(ptp_parse_header);
+
+bool ptp_msg_is_sync(struct sk_buff *skb, unsigned int type)
+{
+ struct ptp_header *hdr;
+
+ hdr = ptp_parse_header(skb, type);
+ if (!hdr)
+ return false;
+
+ return ptp_get_msgtype(hdr, type) == PTP_MSGTYPE_SYNC;
+}
+EXPORT_SYMBOL_GPL(ptp_msg_is_sync);
+
void __init ptp_classifier_init(void)
{
static struct sock_filter ptp_filter[] __initdata = {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e1152f4ffe33..74864dc46a7e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -9,7 +9,7 @@
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Fixes:
- * Vitaly E. Lavrov RTA_OK arithmetics was wrong.
+ * Vitaly E. Lavrov RTA_OK arithmetic was wrong.
*/
#include <linux/bitops.h>
@@ -54,8 +54,10 @@
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
+#include "dev.h"
+
#define RTNL_MAX_TYPE 50
-#define RTNL_SLAVE_MAX_TYPE 36
+#define RTNL_SLAVE_MAX_TYPE 40
struct rtnl_link {
rtnl_doit_func doit;
@@ -95,6 +97,39 @@ void __rtnl_unlock(void)
defer_kfree_skb_list = NULL;
+ /* Ensure that we didn't actually add any TODO item when __rtnl_unlock()
+ * is used. In some places, e.g. in cfg80211, we have code that will do
+ * something like
+ * rtnl_lock()
+ * wiphy_lock()
+ * ...
+ * rtnl_unlock()
+ *
+ * and because netdev_run_todo() acquires the RTNL for items on the list
+ * we could cause a situation such as this:
+ * Thread 1 Thread 2
+ * rtnl_lock()
+ * unregister_netdevice()
+ * __rtnl_unlock()
+ * rtnl_lock()
+ * wiphy_lock()
+ * rtnl_unlock()
+ * netdev_run_todo()
+ * __rtnl_unlock()
+ *
+ * // list not empty now
+ * // because of thread 2
+ * rtnl_lock()
+ * while (!list_empty(...))
+ * rtnl_lock()
+ * wiphy_lock()
+ * **** DEADLOCK ****
+ *
+ * However, usage of __rtnl_unlock() is rare, and so we can ensure that
+ * it's not used in cases where something is added to do the list.
+ */
+ WARN_ON(!list_empty(&net_todo_list));
+
mutex_unlock(&rtnl_mutex);
while (head) {
@@ -139,7 +174,7 @@ bool lockdep_rtnl_is_held(void)
EXPORT_SYMBOL(lockdep_rtnl_is_held);
#endif /* #ifdef CONFIG_PROVE_LOCKING */
-static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
+static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
static inline int rtm_msgindex(int msgtype)
{
@@ -157,7 +192,7 @@ static inline int rtm_msgindex(int msgtype)
static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
{
- struct rtnl_link **tab;
+ struct rtnl_link __rcu **tab;
if (protocol >= ARRAY_SIZE(rtnl_msg_handlers))
protocol = PF_UNSPEC;
@@ -166,7 +201,7 @@ static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
if (!tab)
tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]);
- return tab[msgtype];
+ return rcu_dereference_rtnl(tab[msgtype]);
}
static int rtnl_register_internal(struct module *owner,
@@ -183,7 +218,7 @@ static int rtnl_register_internal(struct module *owner,
msgindex = rtm_msgindex(msgtype);
rtnl_lock();
- tab = rtnl_msg_handlers[protocol];
+ tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
if (tab == NULL) {
tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
if (!tab)
@@ -214,6 +249,8 @@ static int rtnl_register_internal(struct module *owner,
if (dumpit)
link->dumpit = dumpit;
+ WARN_ON(rtnl_msgtype_kind(msgtype) != RTNL_KIND_DEL &&
+ (flags & RTNL_FLAG_BULK_DEL_SUPPORTED));
link->flags |= flags;
/* publish protocol:msgtype */
@@ -234,7 +271,7 @@ unlock:
* @msgtype: rtnetlink message type
* @doit: Function pointer called for each request message
* @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
+ * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
*
* Like rtnl_register, but for use by removable modules.
*/
@@ -254,7 +291,7 @@ EXPORT_SYMBOL_GPL(rtnl_register_module);
* @msgtype: rtnetlink message type
* @doit: Function pointer called for each request message
* @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
+ * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions
*
* Registers the specified function pointers (at least one of them has
* to be non-NULL) to be called whenever a request message for the
@@ -286,7 +323,8 @@ void rtnl_register(int protocol, int msgtype,
*/
int rtnl_unregister(int protocol, int msgtype)
{
- struct rtnl_link **tab, *link;
+ struct rtnl_link __rcu **tab;
+ struct rtnl_link *link;
int msgindex;
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
@@ -299,8 +337,8 @@ int rtnl_unregister(int protocol, int msgtype)
return -ENOENT;
}
- link = tab[msgindex];
- rcu_assign_pointer(tab[msgindex], NULL);
+ link = rtnl_dereference(tab[msgindex]);
+ RCU_INIT_POINTER(tab[msgindex], NULL);
rtnl_unlock();
kfree_rcu(link, rcu);
@@ -318,24 +356,25 @@ EXPORT_SYMBOL_GPL(rtnl_unregister);
*/
void rtnl_unregister_all(int protocol)
{
- struct rtnl_link **tab, *link;
+ struct rtnl_link __rcu **tab;
+ struct rtnl_link *link;
int msgindex;
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
rtnl_lock();
- tab = rtnl_msg_handlers[protocol];
+ tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
if (!tab) {
rtnl_unlock();
return;
}
RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL);
for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) {
- link = tab[msgindex];
+ link = rtnl_dereference(tab[msgindex]);
if (!link)
continue;
- rcu_assign_pointer(tab[msgindex], NULL);
+ RCU_INIT_POINTER(tab[msgindex], NULL);
kfree_rcu(link, rcu);
}
rtnl_unlock();
@@ -374,12 +413,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops)
if (rtnl_link_ops_get(ops->kind))
return -EEXIST;
- /* The check for setup is here because if ops
+ /* The check for alloc/setup is here because if ops
* does not have that filled up, it is not possible
* to use the ops for creating device. So do not
* fill up dellink as well. That disables rtnl_dellink.
*/
- if (ops->setup && !ops->dellink)
+ if ((ops->alloc || ops->setup) && !ops->dellink)
ops->dellink = unregister_netdevice_queue;
list_add_tail(&ops->list, &link_ops);
@@ -457,7 +496,7 @@ static void rtnl_lock_unregistering_all(void)
* setup_net() and cleanup_net() are not possible.
*/
for_each_net(net) {
- if (net->dev_unreg_count > 0) {
+ if (atomic_read(&net->dev_unreg_count) > 0) {
unregistering = true;
break;
}
@@ -541,7 +580,9 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family)
{
const struct rtnl_af_ops *ops;
- list_for_each_entry_rcu(ops, &rtnl_af_ops, list) {
+ ASSERT_RTNL();
+
+ list_for_each_entry(ops, &rtnl_af_ops, list) {
if (ops->family == family)
return ops;
}
@@ -706,15 +747,8 @@ out:
int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo)
{
struct sock *rtnl = net->rtnl;
- int err = 0;
- NETLINK_CB(skb).dst_group = group;
- if (echo)
- refcount_inc(&skb->users);
- netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
- if (echo)
- err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
- return err;
+ return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL);
}
int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
@@ -729,12 +763,8 @@ void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
struct nlmsghdr *nlh, gfp_t flags)
{
struct sock *rtnl = net->rtnl;
- int report = 0;
-
- if (nlh)
- report = nlmsg_report(nlh);
- nlmsg_notify(rtnl, skb, pid, group, report, flags);
+ nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags);
}
EXPORT_SYMBOL(rtnl_notify);
@@ -829,22 +859,27 @@ static void set_operstate(struct net_device *dev, unsigned char transition)
switch (transition) {
case IF_OPER_UP:
if ((operstate == IF_OPER_DORMANT ||
+ operstate == IF_OPER_TESTING ||
operstate == IF_OPER_UNKNOWN) &&
- !netif_dormant(dev))
+ !netif_dormant(dev) && !netif_testing(dev))
operstate = IF_OPER_UP;
break;
+ case IF_OPER_TESTING:
+ if (netif_oper_up(dev))
+ operstate = IF_OPER_TESTING;
+ break;
+
case IF_OPER_DORMANT:
- if (operstate == IF_OPER_UP ||
- operstate == IF_OPER_UNKNOWN)
+ if (netif_oper_up(dev))
operstate = IF_OPER_DORMANT;
break;
}
if (dev->operstate != operstate) {
- write_lock_bh(&dev_base_lock);
+ write_lock(&dev_base_lock);
dev->operstate = operstate;
- write_unlock_bh(&dev_base_lock);
+ write_unlock(&dev_base_lock);
netdev_state_change(dev);
}
}
@@ -993,6 +1028,16 @@ static size_t rtnl_prop_list_size(const struct net_device *dev)
return size;
}
+static size_t rtnl_proto_down_size(const struct net_device *dev)
+{
+ size_t size = nla_total_size(1);
+
+ if (dev->proto_down_reason)
+ size += nla_total_size(0) + nla_total_size(4);
+
+ return size;
+}
+
static noinline size_t if_nlmsg_size(const struct net_device *dev,
u32 ext_filter_mask)
{
@@ -1012,10 +1057,14 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_MASTER */
+ nla_total_size(1) /* IFLA_CARRIER */
+ nla_total_size(4) /* IFLA_PROMISCUITY */
+ + nla_total_size(4) /* IFLA_ALLMULTI */
+ nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ nla_total_size(4) /* IFLA_GSO_MAX_SEGS */
+ nla_total_size(4) /* IFLA_GSO_MAX_SIZE */
+ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */
+ + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */
+ + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
@@ -1034,7 +1083,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_EVENT */
+ nla_total_size(4) /* IFLA_NEW_NETNSID */
+ nla_total_size(4) /* IFLA_NEW_IFINDEX */
- + nla_total_size(1) /* IFLA_PROTO_DOWN */
+ + rtnl_proto_down_size(dev) /* proto down */
+ nla_total_size(4) /* IFLA_TARGET_NETNSID */
+ nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */
+ nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */
@@ -1409,13 +1458,12 @@ static u32 rtnl_xdp_prog_skb(struct net_device *dev)
static u32 rtnl_xdp_prog_drv(struct net_device *dev)
{
- return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
+ return dev_xdp_prog_id(dev, XDP_MODE_DRV);
}
static u32 rtnl_xdp_prog_hw(struct net_device *dev)
{
- return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf,
- XDP_QUERY_PROG_HW);
+ return dev_xdp_prog_id(dev, XDP_MODE_HW);
}
static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev,
@@ -1651,6 +1699,35 @@ nest_cancel:
return ret;
}
+static int rtnl_fill_proto_down(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct nlattr *pr;
+ u32 preason;
+
+ if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
+ goto nla_put_failure;
+
+ preason = dev->proto_down_reason;
+ if (!preason)
+ return 0;
+
+ pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON);
+ if (!pr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) {
+ nla_nest_cancel(skb, pr);
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, pr);
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb,
struct net_device *dev, struct net *src_net,
int type, u32 pid, u32 seq, u32 change,
@@ -1660,6 +1737,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
{
struct ifinfomsg *ifm;
struct nlmsghdr *nlh;
+ struct Qdisc *qdisc;
ASSERT_RTNL();
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
@@ -1677,6 +1755,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))
goto nla_put_failure;
+ qdisc = rtnl_dereference(dev->qdisc);
if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) ||
nla_put_u8(skb, IFLA_OPERSTATE,
@@ -1687,27 +1766,33 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) ||
nla_put_u32(skb, IFLA_GROUP, dev->group) ||
nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
+ nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) ||
nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) ||
nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) ||
+ nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) ||
+ nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) ||
+ nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) ||
#ifdef CONFIG_RPS
nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
#endif
put_master_ifindex(skb, dev) ||
nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
- (dev->qdisc &&
- nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
+ (qdisc &&
+ nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) ||
nla_put_ifalias(skb, dev) ||
nla_put_u32(skb, IFLA_CARRIER_CHANGES,
atomic_read(&dev->carrier_up_count) +
atomic_read(&dev->carrier_down_count)) ||
- nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) ||
nla_put_u32(skb, IFLA_CARRIER_UP_COUNT,
atomic_read(&dev->carrier_up_count)) ||
nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT,
atomic_read(&dev->carrier_down_count)))
goto nla_put_failure;
+ if (rtnl_fill_proto_down(skb, dev))
+ goto nla_put_failure;
+
if (event != IFLA_EVENT_NONE) {
if (nla_put_u32(skb, IFLA_EVENT, event))
goto nla_put_failure;
@@ -1770,6 +1855,16 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
if (rtnl_fill_prop_list(skb, dev))
goto nla_put_failure;
+ if (dev->dev.parent &&
+ nla_put_string(skb, IFLA_PARENT_DEV_NAME,
+ dev_name(dev->dev.parent)))
+ goto nla_put_failure;
+
+ if (dev->dev.parent && dev->dev.parent->bus &&
+ nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME,
+ dev->dev.parent->bus->name))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -1827,6 +1922,13 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_ALT_IFNAME] = { .type = NLA_STRING,
.len = ALTIFNAMSIZ - 1 },
[IFLA_PERM_ADDRESS] = { .type = NLA_REJECT },
+ [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED },
+ [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
+ [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING },
+ [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 },
+ [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT },
+ [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT },
+ [IFLA_ALLMULTI] = { .type = NLA_REJECT },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1872,7 +1974,9 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
};
static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
+ [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD },
[IFLA_XDP_FD] = { .type = NLA_S32 },
+ [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 },
[IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
[IFLA_XDP_FLAGS] = { .type = NLA_U32 },
[IFLA_XDP_PROG_ID] = { .type = NLA_U32 },
@@ -1889,7 +1993,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla
if (linfo[IFLA_INFO_KIND]) {
char kind[MODULE_NAME_LEN];
- nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
+ nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
ops = rtnl_link_ops_get(kind);
}
@@ -1904,6 +2008,13 @@ static bool link_master_filtered(struct net_device *dev, int master_idx)
return false;
master = netdev_master_upper_dev_get(dev);
+
+ /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need
+ * another invalid value for ifindex to denote "no master".
+ */
+ if (master_idx == -1)
+ return !!master;
+
if (!master || master->ifindex != master_idx)
return true;
@@ -2095,7 +2206,7 @@ out:
out_err:
cb->args[1] = idx;
cb->args[0] = h;
- cb->seq = net->dev_base_seq;
+ cb->seq = tgt_net->dev_base_seq;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
if (netnsid >= 0)
put_net(tgt_net);
@@ -2202,7 +2313,21 @@ invalid_attr:
return -EINVAL;
}
-static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
+static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,
+ int max_tx_rate)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_set_vf_rate)
+ return -EOPNOTSUPP;
+ if (max_tx_rate && max_tx_rate < min_tx_rate)
+ return -EINVAL;
+
+ return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate);
+}
+
+static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
{
if (dev) {
if (tb[IFLA_ADDRESS] &&
@@ -2221,27 +2346,18 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
const struct rtnl_af_ops *af_ops;
- rcu_read_lock();
af_ops = rtnl_af_lookup(nla_type(af));
- if (!af_ops) {
- rcu_read_unlock();
+ if (!af_ops)
return -EAFNOSUPPORT;
- }
- if (!af_ops->set_link_af) {
- rcu_read_unlock();
+ if (!af_ops->set_link_af)
return -EOPNOTSUPP;
- }
if (af_ops->validate_link_af) {
- err = af_ops->validate_link_af(dev, af);
- if (err < 0) {
- rcu_read_unlock();
+ err = af_ops->validate_link_af(dev, af, extack);
+ if (err < 0)
return err;
- }
}
-
- rcu_read_unlock();
}
}
@@ -2339,11 +2455,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (err < 0)
return err;
- err = -EOPNOTSUPP;
- if (ops->ndo_set_vf_rate)
- err = ops->ndo_set_vf_rate(dev, ivt->vf,
- ivf.min_tx_rate,
- ivt->rate);
+ err = rtnl_set_vf_rate(dev, ivt->vf,
+ ivf.min_tx_rate, ivt->rate);
if (err < 0)
return err;
}
@@ -2353,11 +2466,9 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (ivt->vf >= INT_MAX)
return -EINVAL;
- err = -EOPNOTSUPP;
- if (ops->ndo_set_vf_rate)
- err = ops->ndo_set_vf_rate(dev, ivt->vf,
- ivt->min_tx_rate,
- ivt->max_tx_rate);
+
+ err = rtnl_set_vf_rate(dev, ivt->vf,
+ ivt->min_tx_rate, ivt->max_tx_rate);
if (err < 0)
return err;
}
@@ -2453,7 +2564,6 @@ static int do_set_master(struct net_device *dev, int ifindex,
err = ops->ndo_del_slave(upper_dev, dev);
if (err)
return err;
- netdev_update_lockdep_key(dev);
} else {
return -EOPNOTSUPP;
}
@@ -2475,30 +2585,105 @@ static int do_set_master(struct net_device *dev, int ifindex,
return 0;
}
+static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = {
+ [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 },
+ [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 },
+};
+
+static int do_set_proto_down(struct net_device *dev,
+ struct nlattr *nl_proto_down,
+ struct nlattr *nl_proto_down_reason,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1];
+ unsigned long mask = 0;
+ u32 value;
+ bool proto_down;
+ int err;
+
+ if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) {
+ NL_SET_ERR_MSG(extack, "Protodown not supported by device");
+ return -EOPNOTSUPP;
+ }
+
+ if (nl_proto_down_reason) {
+ err = nla_parse_nested_deprecated(pdreason,
+ IFLA_PROTO_DOWN_REASON_MAX,
+ nl_proto_down_reason,
+ ifla_proto_down_reason_policy,
+ NULL);
+ if (err < 0)
+ return err;
+
+ if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) {
+ NL_SET_ERR_MSG(extack, "Invalid protodown reason value");
+ return -EINVAL;
+ }
+
+ value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]);
+
+ if (pdreason[IFLA_PROTO_DOWN_REASON_MASK])
+ mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]);
+
+ dev_change_proto_down_reason(dev, mask, value);
+ }
+
+ if (nl_proto_down) {
+ proto_down = nla_get_u8(nl_proto_down);
+
+ /* Don't turn off protodown if there are active reasons */
+ if (!proto_down && dev->proto_down_reason) {
+ NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons");
+ return -EBUSY;
+ }
+ err = dev_change_proto_down(dev,
+ proto_down);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
#define DO_SETLINK_MODIFIED 0x01
/* notify flag means notify + modified. */
#define DO_SETLINK_NOTIFY 0x03
static int do_setlink(const struct sk_buff *skb,
struct net_device *dev, struct ifinfomsg *ifm,
struct netlink_ext_ack *extack,
- struct nlattr **tb, char *ifname, int status)
+ struct nlattr **tb, int status)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ char ifname[IFNAMSIZ];
int err;
- err = validate_linkmsg(dev, tb);
+ err = validate_linkmsg(dev, tb, extack);
if (err < 0)
return err;
+ if (tb[IFLA_IFNAME])
+ nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ else
+ ifname[0] = '\0';
+
if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) {
- struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
- tb, CAP_NET_ADMIN);
+ const char *pat = ifname[0] ? ifname : NULL;
+ struct net *net;
+ int new_ifindex;
+
+ net = rtnl_link_get_net_capable(skb, dev_net(dev),
+ tb, CAP_NET_ADMIN);
if (IS_ERR(net)) {
err = PTR_ERR(net);
goto errout;
}
- err = dev_change_net_namespace(dev, net, ifname);
+ if (tb[IFLA_NEW_IFINDEX])
+ new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]);
+ else
+ new_ifindex = 0;
+
+ err = __dev_change_net_namespace(dev, net, pat, new_ifindex);
put_net(net);
if (err)
goto errout;
@@ -2548,7 +2733,7 @@ static int do_setlink(const struct sk_buff *skb,
sa->sa_family = dev->type;
memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
dev->addr_len);
- err = dev_set_mac_address(dev, sa, extack);
+ err = dev_set_mac_address_user(dev, sa, extack);
kfree(sa);
if (err)
goto errout;
@@ -2592,13 +2777,6 @@ static int do_setlink(const struct sk_buff *skb,
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
}
- if (ifm->ifi_flags || ifm->ifi_change) {
- err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
- extack);
- if (err < 0)
- goto errout;
- }
-
if (tb[IFLA_MASTER]) {
err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
if (err)
@@ -2606,6 +2784,13 @@ static int do_setlink(const struct sk_buff *skb,
status |= DO_SETLINK_MODIFIED;
}
+ if (ifm->ifi_flags || ifm->ifi_change) {
+ err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
+ extack);
+ if (err < 0)
+ goto errout;
+ }
+
if (tb[IFLA_CARRIER]) {
err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
if (err)
@@ -2625,7 +2810,7 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_GSO_MAX_SIZE]) {
u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]);
- if (max_size > GSO_MAX_SIZE) {
+ if (max_size > dev->tso_max_size) {
err = -EINVAL;
goto errout;
}
@@ -2639,13 +2824,22 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_GSO_MAX_SEGS]) {
u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
- if (max_segs > GSO_MAX_SEGS) {
+ if (max_segs > GSO_MAX_SEGS || max_segs > dev->tso_max_segs) {
err = -EINVAL;
goto errout;
}
if (dev->gso_max_segs ^ max_segs) {
- dev->gso_max_segs = max_segs;
+ netif_set_gso_max_segs(dev, max_segs);
+ status |= DO_SETLINK_MODIFIED;
+ }
+ }
+
+ if (tb[IFLA_GRO_MAX_SIZE]) {
+ u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]);
+
+ if (dev->gro_max_size ^ gro_max_size) {
+ netif_set_gro_max_size(dev, gro_max_size);
status |= DO_SETLINK_MODIFIED;
}
}
@@ -2656,11 +2850,11 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_LINKMODE]) {
unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]);
- write_lock_bh(&dev_base_lock);
+ write_lock(&dev_base_lock);
if (dev->link_mode ^ value)
status |= DO_SETLINK_NOTIFY;
dev->link_mode = value;
- write_unlock_bh(&dev_base_lock);
+ write_unlock(&dev_base_lock);
}
if (tb[IFLA_VFINFO_LIST]) {
@@ -2747,25 +2941,20 @@ static int do_setlink(const struct sk_buff *skb,
nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
const struct rtnl_af_ops *af_ops;
- rcu_read_lock();
-
BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af))));
- err = af_ops->set_link_af(dev, af);
- if (err < 0) {
- rcu_read_unlock();
+ err = af_ops->set_link_af(dev, af, extack);
+ if (err < 0)
goto errout;
- }
- rcu_read_unlock();
status |= DO_SETLINK_NOTIFY;
}
}
err = 0;
- if (tb[IFLA_PROTO_DOWN]) {
- err = dev_change_proto_down(dev,
- nla_get_u8(tb[IFLA_PROTO_DOWN]));
+ if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) {
+ err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN],
+ tb[IFLA_PROTO_DOWN_REASON], extack);
if (err)
goto errout;
status |= DO_SETLINK_NOTIFY;
@@ -2799,8 +2988,20 @@ static int do_setlink(const struct sk_buff *skb,
}
if (xdp[IFLA_XDP_FD]) {
+ int expected_fd = -1;
+
+ if (xdp_flags & XDP_FLAGS_REPLACE) {
+ if (!xdp[IFLA_XDP_EXPECTED_FD]) {
+ err = -EINVAL;
+ goto errout;
+ }
+ expected_fd =
+ nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]);
+ }
+
err = dev_change_xdp_fd(dev, extack,
nla_get_s32(xdp[IFLA_XDP_FD]),
+ expected_fd,
xdp_flags);
if (err)
goto errout;
@@ -2822,21 +3023,16 @@ errout:
}
static struct net_device *rtnl_dev_get(struct net *net,
- struct nlattr *ifname_attr,
- struct nlattr *altifname_attr,
- char *ifname)
-{
- char buffer[ALTIFNAMSIZ];
-
- if (!ifname) {
- ifname = buffer;
- if (ifname_attr)
- nla_strlcpy(ifname, ifname_attr, IFNAMSIZ);
- else if (altifname_attr)
- nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ);
- else
- return NULL;
- }
+ struct nlattr *tb[])
+{
+ char ifname[ALTIFNAMSIZ];
+
+ if (tb[IFLA_IFNAME])
+ nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ else if (tb[IFLA_ALT_IFNAME])
+ nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ);
+ else
+ return NULL;
return __dev_get_by_name(net, ifname);
}
@@ -2849,7 +3045,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net_device *dev;
int err;
struct nlattr *tb[IFLA_MAX+1];
- char ifname[IFNAMSIZ];
err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
ifla_policy, extack);
@@ -2860,17 +3055,12 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
goto errout;
- if (tb[IFLA_IFNAME])
- nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
- else
- ifname[0] = '\0';
-
err = -EINVAL;
ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(net, ifm->ifi_index);
else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
- dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname);
+ dev = rtnl_dev_get(net, tb);
else
goto errout;
@@ -2879,7 +3069,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0);
+ err = do_setlink(skb, dev, ifm, extack, tb, 0);
errout:
return err;
}
@@ -2968,15 +3158,14 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
- dev = rtnl_dev_get(net, tb[IFLA_IFNAME],
- tb[IFLA_ALT_IFNAME], NULL);
+ dev = rtnl_dev_get(net, tb);
else if (tb[IFLA_GROUP])
err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP]));
else
goto out;
if (!dev) {
- if (tb[IFLA_IFNAME] || ifm->ifi_index > 0)
+ if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME] || ifm->ifi_index > 0)
err = -ENODEV;
goto out;
@@ -3044,8 +3233,17 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
return ERR_PTR(-EINVAL);
}
- dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
- ops->setup, num_tx_queues, num_rx_queues);
+ if (ops->alloc) {
+ dev = ops->alloc(tb, ifname, name_assign_type,
+ num_tx_queues, num_rx_queues);
+ if (IS_ERR(dev))
+ return dev;
+ } else {
+ dev = alloc_netdev_mqs(ops->priv_size, ifname,
+ name_assign_type, ops->setup,
+ num_tx_queues, num_rx_queues);
+ }
+
if (!dev)
return ERR_PTR(-ENOMEM);
@@ -3065,8 +3263,8 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
dev->mtu = mtu;
}
if (tb[IFLA_ADDRESS]) {
- memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]),
- nla_len(tb[IFLA_ADDRESS]));
+ __dev_addr_set(dev, nla_data(tb[IFLA_ADDRESS]),
+ nla_len(tb[IFLA_ADDRESS]));
dev->addr_assign_type = NET_ADDR_SET;
}
if (tb[IFLA_BROADCAST])
@@ -3083,7 +3281,9 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
if (tb[IFLA_GSO_MAX_SIZE])
netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
if (tb[IFLA_GSO_MAX_SEGS])
- dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
+ netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS]));
+ if (tb[IFLA_GRO_MAX_SIZE])
+ netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE]));
return dev;
}
@@ -3100,7 +3300,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb,
for_each_netdev_safe(net, dev, aux) {
if (dev->group == group) {
- err = do_setlink(skb, dev, ifm, extack, tb, NULL, 0);
+ err = do_setlink(skb, dev, ifm, extack, tb, 0);
if (err < 0)
return err;
}
@@ -3109,24 +3309,118 @@ static int rtnl_group_changelink(const struct sk_buff *skb,
return 0;
}
-static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct nlattr **attr, struct netlink_ext_ack *extack)
+static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
+ const struct rtnl_link_ops *ops,
+ struct nlattr **tb, struct nlattr **data,
+ struct netlink_ext_ack *extack)
{
- struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
unsigned char name_assign_type = NET_NAME_USER;
+ struct net *net = sock_net(skb->sk);
+ struct net *dest_net, *link_net;
+ struct net_device *dev;
+ char ifname[IFNAMSIZ];
+ int err;
+
+ if (!ops->alloc && !ops->setup)
+ return -EOPNOTSUPP;
+
+ if (tb[IFLA_IFNAME]) {
+ nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ } else {
+ snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
+ name_assign_type = NET_NAME_ENUM;
+ }
+
+ dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
+ if (IS_ERR(dest_net))
+ return PTR_ERR(dest_net);
+
+ if (tb[IFLA_LINK_NETNSID]) {
+ int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
+
+ link_net = get_net_ns_by_id(dest_net, id);
+ if (!link_net) {
+ NL_SET_ERR_MSG(extack, "Unknown network namespace id");
+ err = -EINVAL;
+ goto out;
+ }
+ err = -EPERM;
+ if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN))
+ goto out;
+ } else {
+ link_net = NULL;
+ }
+
+ dev = rtnl_create_link(link_net ? : dest_net, ifname,
+ name_assign_type, ops, tb, extack);
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ goto out;
+ }
+
+ dev->ifindex = ifm->ifi_index;
+
+ if (ops->newlink)
+ err = ops->newlink(link_net ? : net, dev, tb, data, extack);
+ else
+ err = register_netdevice(dev);
+ if (err < 0) {
+ free_netdev(dev);
+ goto out;
+ }
+
+ err = rtnl_configure_link(dev, ifm);
+ if (err < 0)
+ goto out_unregister;
+ if (link_net) {
+ err = dev_change_net_namespace(dev, dest_net, ifname);
+ if (err < 0)
+ goto out_unregister;
+ }
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
+ if (err)
+ goto out_unregister;
+ }
+out:
+ if (link_net)
+ put_net(link_net);
+ put_net(dest_net);
+ return err;
+out_unregister:
+ if (ops->newlink) {
+ LIST_HEAD(list_kill);
+
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+ } else {
+ unregister_netdevice(dev);
+ }
+ goto out;
+}
+
+struct rtnl_newlink_tbs {
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct nlattr *attr[RTNL_MAX_TYPE + 1];
+ struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
+};
+
+static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct rtnl_newlink_tbs *tbs,
+ struct netlink_ext_ack *extack)
+{
struct nlattr *linkinfo[IFLA_INFO_MAX + 1];
- const struct rtnl_link_ops *m_ops = NULL;
- struct net_device *master_dev = NULL;
+ struct nlattr ** const tb = tbs->tb;
+ const struct rtnl_link_ops *m_ops;
+ struct net_device *master_dev;
struct net *net = sock_net(skb->sk);
const struct rtnl_link_ops *ops;
- struct nlattr *tb[IFLA_MAX + 1];
- struct net *dest_net, *link_net;
struct nlattr **slave_data;
char kind[MODULE_NAME_LEN];
struct net_device *dev;
struct ifinfomsg *ifm;
- char ifname[IFNAMSIZ];
struct nlattr **data;
+ bool link_specified;
int err;
#ifdef CONFIG_MODULES
@@ -3141,26 +3435,27 @@ replay:
if (err < 0)
return err;
- if (tb[IFLA_IFNAME])
- nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
- else
- ifname[0] = '\0';
-
ifm = nlmsg_data(nlh);
- if (ifm->ifi_index > 0)
+ if (ifm->ifi_index > 0) {
+ link_specified = true;
dev = __dev_get_by_index(net, ifm->ifi_index);
- else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
- dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname);
- else
+ } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) {
+ link_specified = true;
+ dev = rtnl_dev_get(net, tb);
+ } else {
+ link_specified = false;
dev = NULL;
+ }
+ master_dev = NULL;
+ m_ops = NULL;
if (dev) {
master_dev = netdev_master_upper_dev_get(dev);
if (master_dev)
m_ops = master_dev->rtnl_link_ops;
}
- err = validate_linkmsg(dev, tb);
+ err = validate_linkmsg(dev, tb, extack);
if (err < 0)
return err;
@@ -3174,7 +3469,7 @@ replay:
memset(linkinfo, 0, sizeof(linkinfo));
if (linkinfo[IFLA_INFO_KIND]) {
- nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
+ nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
ops = rtnl_link_ops_get(kind);
} else {
kind[0] = '\0';
@@ -3187,12 +3482,12 @@ replay:
return -EINVAL;
if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
- err = nla_parse_nested_deprecated(attr, ops->maxtype,
+ err = nla_parse_nested_deprecated(tbs->attr, ops->maxtype,
linkinfo[IFLA_INFO_DATA],
ops->policy, extack);
if (err < 0)
return err;
- data = attr;
+ data = tbs->attr;
}
if (ops->validate) {
err = ops->validate(tb, data, extack);
@@ -3208,14 +3503,14 @@ replay:
if (m_ops->slave_maxtype &&
linkinfo[IFLA_INFO_SLAVE_DATA]) {
- err = nla_parse_nested_deprecated(slave_attr,
+ err = nla_parse_nested_deprecated(tbs->slave_attr,
m_ops->slave_maxtype,
linkinfo[IFLA_INFO_SLAVE_DATA],
m_ops->slave_policy,
extack);
if (err < 0)
return err;
- slave_data = slave_attr;
+ slave_data = tbs->slave_attr;
}
}
@@ -3249,11 +3544,16 @@ replay:
status |= DO_SETLINK_NOTIFY;
}
- return do_setlink(skb, dev, ifm, extack, tb, ifname, status);
+ return do_setlink(skb, dev, ifm, extack, tb, status);
}
if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
- if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
+ /* No dev found and NLM_F_CREATE not set. Requested dev does not exist,
+ * or it's for a group
+ */
+ if (link_specified)
+ return -ENODEV;
+ if (tb[IFLA_GROUP])
return rtnl_group_changelink(skb, net,
nla_get_u32(tb[IFLA_GROUP]),
ifm, extack, tb);
@@ -3278,104 +3578,21 @@ replay:
return -EOPNOTSUPP;
}
- if (!ops->setup)
- return -EOPNOTSUPP;
-
- if (!ifname[0]) {
- snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
- name_assign_type = NET_NAME_ENUM;
- }
-
- dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
- if (IS_ERR(dest_net))
- return PTR_ERR(dest_net);
-
- if (tb[IFLA_LINK_NETNSID]) {
- int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
-
- link_net = get_net_ns_by_id(dest_net, id);
- if (!link_net) {
- NL_SET_ERR_MSG(extack, "Unknown network namespace id");
- err = -EINVAL;
- goto out;
- }
- err = -EPERM;
- if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN))
- goto out;
- } else {
- link_net = NULL;
- }
-
- dev = rtnl_create_link(link_net ? : dest_net, ifname,
- name_assign_type, ops, tb, extack);
- if (IS_ERR(dev)) {
- err = PTR_ERR(dev);
- goto out;
- }
-
- dev->ifindex = ifm->ifi_index;
-
- if (ops->newlink) {
- err = ops->newlink(link_net ? : net, dev, tb, data, extack);
- /* Drivers should call free_netdev() in ->destructor
- * and unregister it on failure after registration
- * so that device could be finally freed in rtnl_unlock.
- */
- if (err < 0) {
- /* If device is not registered at all, free it now */
- if (dev->reg_state == NETREG_UNINITIALIZED)
- free_netdev(dev);
- goto out;
- }
- } else {
- err = register_netdevice(dev);
- if (err < 0) {
- free_netdev(dev);
- goto out;
- }
- }
- err = rtnl_configure_link(dev, ifm);
- if (err < 0)
- goto out_unregister;
- if (link_net) {
- err = dev_change_net_namespace(dev, dest_net, ifname);
- if (err < 0)
- goto out_unregister;
- }
- if (tb[IFLA_MASTER]) {
- err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
- if (err)
- goto out_unregister;
- }
-out:
- if (link_net)
- put_net(link_net);
- put_net(dest_net);
- return err;
-out_unregister:
- if (ops->newlink) {
- LIST_HEAD(list_kill);
-
- ops->dellink(dev, &list_kill);
- unregister_netdevice_many(&list_kill);
- } else {
- unregister_netdevice(dev);
- }
- goto out;
+ return rtnl_newlink_create(skb, ifm, ops, tb, data, extack);
}
static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
- struct nlattr **attr;
+ struct rtnl_newlink_tbs *tbs;
int ret;
- attr = kmalloc_array(RTNL_MAX_TYPE + 1, sizeof(*attr), GFP_KERNEL);
- if (!attr)
+ tbs = kmalloc(sizeof(*tbs), GFP_KERNEL);
+ if (!tbs)
return -ENOMEM;
- ret = __rtnl_newlink(skb, nlh, attr, extack);
- kfree(attr);
+ ret = __rtnl_newlink(skb, nlh, tbs, extack);
+ kfree(tbs);
return ret;
}
@@ -3463,8 +3680,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
- dev = rtnl_dev_get(tgt_net, tb[IFLA_IFNAME],
- tb[IFLA_ALT_IFNAME], NULL);
+ dev = rtnl_dev_get(tgt_net, tb);
else
goto out;
@@ -3498,13 +3714,24 @@ static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr,
bool *changed, struct netlink_ext_ack *extack)
{
char *alt_ifname;
+ size_t size;
int err;
err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack);
if (err)
return err;
- alt_ifname = nla_strdup(attr, GFP_KERNEL);
+ if (cmd == RTM_NEWLINKPROP) {
+ size = rtnl_prop_list_size(dev);
+ size += nla_total_size(ALTIFNAMSIZ);
+ if (size >= U16_MAX) {
+ NL_SET_ERR_MSG(extack,
+ "effective property list too long");
+ return -EINVAL;
+ }
+ }
+
+ alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
if (!alt_ifname)
return -ENOMEM;
@@ -3548,8 +3775,7 @@ static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh,
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(net, ifm->ifi_index);
else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
- dev = rtnl_dev_get(net, tb[IFLA_IFNAME],
- tb[IFLA_ALT_IFNAME], NULL);
+ dev = rtnl_dev_get(net, tb);
else
return -EINVAL;
@@ -3586,13 +3812,13 @@ static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack);
}
-static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
+static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
- struct net_device *dev;
+ size_t min_ifinfo_dump_size = 0;
struct nlattr *tb[IFLA_MAX+1];
u32 ext_filter_mask = 0;
- u16 min_ifinfo_dump_size = 0;
+ struct net_device *dev;
int hdrlen;
/* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
@@ -3612,9 +3838,8 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
*/
rcu_read_lock();
for_each_netdev_rcu(net, dev) {
- min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size,
- if_nlmsg_size(dev,
- ext_filter_mask));
+ min_ifinfo_dump_size = max(min_ifinfo_dump_size,
+ if_nlmsg_size(dev, ext_filter_mask));
}
rcu_read_unlock();
@@ -3632,7 +3857,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
s_idx = 1;
for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
- struct rtnl_link **tab;
+ struct rtnl_link __rcu **tab;
struct rtnl_link *link;
rtnl_dumpit_func dumpit;
@@ -3646,7 +3871,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
if (!tab)
continue;
- link = tab[type];
+ link = rcu_dereference_rtnl(tab[type]);
if (!link)
continue;
@@ -3676,9 +3901,8 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
struct net *net = dev_net(dev);
struct sk_buff *skb;
int err = -ENOBUFS;
- size_t if_info_size;
- skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags);
+ skb = nlmsg_new(if_nlmsg_size(dev, 0), flags);
if (skb == NULL)
goto errout;
@@ -3817,12 +4041,12 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
* implement its own handler for this.
*/
if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
- pr_info("%s: FDB only supports static addresses\n", dev->name);
+ netdev_info(dev, "default FDB implementation only supports local addresses\n");
return err;
}
if (vid) {
- pr_info("%s: vlans aren't supported yet for dev_uc|mc_add()\n", dev->name);
+ netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n");
return err;
}
@@ -3909,7 +4133,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Support fdb on master device the net/bridge default case */
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
- (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
const struct net_device_ops *ops = br_dev->netdev_ops;
@@ -3956,7 +4180,7 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm,
* implement its own handler for this.
*/
if (!(ndm->ndm_state & NUD_PERMANENT)) {
- pr_info("%s: FDB only supports static addresses\n", dev->name);
+ netdev_info(dev, "default FDB implementation only supports local addresses\n");
return err;
}
@@ -3969,22 +4193,36 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm,
}
EXPORT_SYMBOL(ndo_dflt_fdb_del);
+static const struct nla_policy fdb_del_bulk_policy[NDA_MAX + 1] = {
+ [NDA_VLAN] = { .type = NLA_U16 },
+ [NDA_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
+ [NDA_NDM_STATE_MASK] = { .type = NLA_U16 },
+ [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 },
+};
+
static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK);
struct net *net = sock_net(skb->sk);
+ const struct net_device_ops *ops;
struct ndmsg *ndm;
struct nlattr *tb[NDA_MAX+1];
struct net_device *dev;
- int err = -EINVAL;
- __u8 *addr;
+ __u8 *addr = NULL;
+ int err;
u16 vid;
if (!netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
- err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL,
- extack);
+ if (!del_bulk) {
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX,
+ NULL, extack);
+ } else {
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX,
+ fdb_del_bulk_policy, extack);
+ }
if (err < 0)
return err;
@@ -4000,9 +4238,12 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
return -ENODEV;
}
- if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
- NL_SET_ERR_MSG(extack, "invalid address");
- return -EINVAL;
+ if (!del_bulk) {
+ if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
+ NL_SET_ERR_MSG(extack, "invalid address");
+ return -EINVAL;
+ }
+ addr = nla_data(tb[NDA_LLADDR]);
}
if (dev->type != ARPHRD_ETHER) {
@@ -4010,8 +4251,6 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
}
- addr = nla_data(tb[NDA_LLADDR]);
-
err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
if (err)
return err;
@@ -4020,12 +4259,18 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Support fdb on master device the net/bridge default case */
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
- (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
- const struct net_device_ops *ops = br_dev->netdev_ops;
- if (ops->ndo_fdb_del)
- err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid);
+ ops = br_dev->netdev_ops;
+ if (!del_bulk) {
+ if (ops->ndo_fdb_del)
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
+ } else {
+ if (ops->ndo_fdb_del_bulk)
+ err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid,
+ extack);
+ }
if (err)
goto out;
@@ -4035,15 +4280,24 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Embedded bridge, macvlan, and any other device support */
if (ndm->ndm_flags & NTF_SELF) {
- if (dev->netdev_ops->ndo_fdb_del)
- err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr,
- vid);
- else
- err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
+ ops = dev->netdev_ops;
+ if (!del_bulk) {
+ if (ops->ndo_fdb_del)
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack);
+ else
+ err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
+ } else {
+ /* in case err was cleared by NTF_MASTER call */
+ err = -EOPNOTSUPP;
+ if (ops->ndo_fdb_del_bulk)
+ err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid,
+ extack);
+ }
if (!err) {
- rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH,
- ndm->ndm_state);
+ if (!del_bulk)
+ rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH,
+ ndm->ndm_state);
ndm->ndm_flags &= ~NTF_SELF;
}
}
@@ -4246,17 +4500,17 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
continue;
if (!br_idx) { /* user did not specify a specific bridge */
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (netif_is_bridge_port(dev)) {
br_dev = netdev_master_upper_dev_get(dev);
cops = br_dev->netdev_ops;
}
} else {
if (dev != br_dev &&
- !(dev->priv_flags & IFF_BRIDGE_PORT))
+ !netif_is_bridge_port(dev))
continue;
if (br_dev != netdev_master_upper_dev_get(dev) &&
- !(dev->priv_flags & IFF_EBRIDGE))
+ !netif_is_bridge_master(dev))
continue;
cops = ops;
}
@@ -4264,7 +4518,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (idx < s_idx)
goto cont;
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (netif_is_bridge_port(dev)) {
if (cops && cops->ndo_fdb_dump) {
err = cops->ndo_fdb_dump(skb, cb,
br_dev, dev,
@@ -4414,7 +4668,7 @@ static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (dev) {
if (!ndm_flags || (ndm_flags & NTF_MASTER)) {
- if (!(dev->priv_flags & IFF_BRIDGE_PORT)) {
+ if (!netif_is_bridge_port(dev)) {
NL_SET_ERR_MSG(extack, "Device is not a bridge port");
return -EINVAL;
}
@@ -4553,7 +4807,11 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
brport_nla_put_flag(skb, flags, mask,
IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) ||
brport_nla_put_flag(skb, flags, mask,
- IFLA_BRPORT_PROXYARP, BR_PROXYARP)) {
+ IFLA_BRPORT_PROXYARP, BR_PROXYARP) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) {
nla_nest_cancel(skb, protinfo);
goto nla_put_failure;
}
@@ -4716,6 +4974,10 @@ static int rtnl_bridge_notify(struct net_device *dev)
if (err < 0)
goto errout;
+ /* Notification info is only filled for bridge ports, not the bridge
+ * device itself. Therefore, a zero notification length is valid and
+ * should not result in an error.
+ */
if (!skb->len)
goto errout;
@@ -4888,82 +5150,259 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
(!idxattr || idxattr == attrid);
}
-#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1)
-static int rtnl_get_offload_stats_attr_size(int attr_id)
+static bool
+rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id)
{
- switch (attr_id) {
- case IFLA_OFFLOAD_XSTATS_CPU_HIT:
- return sizeof(struct rtnl_link_stats64);
- }
+ return dev->netdev_ops &&
+ dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats &&
+ dev->netdev_ops->ndo_has_offload_stats(dev, attr_id);
+}
- return 0;
+static unsigned int
+rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id)
+{
+ return rtnl_offload_xstats_have_ndo(dev, attr_id) ?
+ sizeof(struct rtnl_link_stats64) : 0;
}
-static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
- int *prividx)
+static int
+rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id,
+ struct sk_buff *skb)
{
+ unsigned int size = rtnl_offload_xstats_get_size_ndo(dev, attr_id);
struct nlattr *attr = NULL;
- int attr_id, size;
void *attr_data;
int err;
- if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
- dev->netdev_ops->ndo_get_offload_stats))
+ if (!size)
return -ENODATA;
- for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
- attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
- if (attr_id < *prividx)
- continue;
+ attr = nla_reserve_64bit(skb, attr_id, size,
+ IFLA_OFFLOAD_XSTATS_UNSPEC);
+ if (!attr)
+ return -EMSGSIZE;
- size = rtnl_get_offload_stats_attr_size(attr_id);
- if (!size)
- continue;
+ attr_data = nla_data(attr);
+ memset(attr_data, 0, size);
- if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
- continue;
+ err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static unsigned int
+rtnl_offload_xstats_get_size_stats(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ bool enabled = netdev_offload_xstats_enabled(dev, type);
+
+ return enabled ? sizeof(struct rtnl_hw_stats64) : 0;
+}
+
+struct rtnl_offload_xstats_request_used {
+ bool request;
+ bool used;
+};
+
+static int
+rtnl_offload_xstats_get_stats(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct rtnl_offload_xstats_request_used *ru,
+ struct rtnl_hw_stats64 *stats,
+ struct netlink_ext_ack *extack)
+{
+ bool request;
+ bool used;
+ int err;
+
+ request = netdev_offload_xstats_enabled(dev, type);
+ if (!request) {
+ used = false;
+ goto out;
+ }
+
+ err = netdev_offload_xstats_get(dev, type, stats, &used, extack);
+ if (err)
+ return err;
+
+out:
+ if (ru) {
+ ru->request = request;
+ ru->used = used;
+ }
+ return 0;
+}
+
+static int
+rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id,
+ struct rtnl_offload_xstats_request_used *ru)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, attr_id);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, ru->request))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int
+rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ struct rtnl_offload_xstats_request_used ru_l3;
+ struct nlattr *nest;
+ int err;
+
+ err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack);
+ if (err)
+ return err;
+
+ nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (rtnl_offload_xstats_fill_hw_s_info_one(skb,
+ IFLA_OFFLOAD_XSTATS_L3_STATS,
+ &ru_l3))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev,
+ int *prividx, u32 off_filter_mask,
+ struct netlink_ext_ack *extack)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO;
+ int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS;
+ int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT;
+ bool have_data = false;
+ int err;
+
+ if (*prividx <= attr_id_cpu_hit &&
+ (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) {
+ err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb);
+ if (!err) {
+ have_data = true;
+ } else if (err != -ENODATA) {
+ *prividx = attr_id_cpu_hit;
+ return err;
+ }
+ }
+
+ if (*prividx <= attr_id_hw_s_info &&
+ (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) {
+ *prividx = attr_id_hw_s_info;
+
+ err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack);
+ if (err)
+ return err;
+
+ have_data = true;
+ *prividx = 0;
+ }
+
+ if (*prividx <= attr_id_l3_stats &&
+ (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) {
+ unsigned int size_l3;
+ struct nlattr *attr;
+
+ *prividx = attr_id_l3_stats;
- attr = nla_reserve_64bit(skb, attr_id, size,
+ size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3);
+ if (!size_l3)
+ goto skip_l3_stats;
+ attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3,
IFLA_OFFLOAD_XSTATS_UNSPEC);
if (!attr)
- goto nla_put_failure;
+ return -EMSGSIZE;
- attr_data = nla_data(attr);
- memset(attr_data, 0, size);
- err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev,
- attr_data);
+ err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL,
+ nla_data(attr), extack);
if (err)
- goto get_offload_stats_failure;
+ return err;
+
+ have_data = true;
+skip_l3_stats:
+ *prividx = 0;
}
- if (!attr)
+ if (!have_data)
return -ENODATA;
*prividx = 0;
return 0;
+}
-nla_put_failure:
- err = -EMSGSIZE;
-get_offload_stats_failure:
- *prividx = attr_id;
- return err;
+static unsigned int
+rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ bool enabled = netdev_offload_xstats_enabled(dev, type);
+
+ return nla_total_size(0) +
+ /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */
+ nla_total_size(sizeof(u8)) +
+ /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */
+ (enabled ? nla_total_size(sizeof(u8)) : 0) +
+ 0;
}
-static int rtnl_get_offload_stats_size(const struct net_device *dev)
+static unsigned int
+rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev)
{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+
+ return nla_total_size(0) +
+ /* IFLA_OFFLOAD_XSTATS_L3_STATS */
+ rtnl_offload_xstats_get_size_hw_s_info_one(dev, t_l3) +
+ 0;
+}
+
+static int rtnl_offload_xstats_get_size(const struct net_device *dev,
+ u32 off_filter_mask)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT;
int nla_size = 0;
- int attr_id;
int size;
- if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
- dev->netdev_ops->ndo_get_offload_stats))
- return 0;
+ if (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(attr_id_cpu_hit)) {
+ size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit);
+ nla_size += nla_total_size_64bit(size);
+ }
- for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
- attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
- if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
- continue;
- size = rtnl_get_offload_stats_attr_size(attr_id);
+ if (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO))
+ nla_size += rtnl_offload_xstats_get_size_hw_s_info(dev);
+
+ if (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) {
+ size = rtnl_offload_xstats_get_size_stats(dev, t_l3);
nla_size += nla_total_size_64bit(size);
}
@@ -4973,11 +5412,21 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev)
return nla_size;
}
+struct rtnl_stats_dump_filters {
+ /* mask[0] filters outer attributes. Then individual nests have their
+ * filtering mask at the index of the nested attribute.
+ */
+ u32 mask[IFLA_STATS_MAX + 1];
+};
+
static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
- unsigned int flags, unsigned int filter_mask,
- int *idxattr, int *prividx)
+ unsigned int flags,
+ const struct rtnl_stats_dump_filters *filters,
+ int *idxattr, int *prividx,
+ struct netlink_ext_ack *extack)
{
+ unsigned int filter_mask = filters->mask[0];
struct if_stats_msg *ifsm;
struct nlmsghdr *nlh;
struct nlattr *attr;
@@ -5003,8 +5452,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64,
sizeof(struct rtnl_link_stats64),
IFLA_STATS_UNSPEC);
- if (!attr)
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
sp = nla_data(attr);
dev_get_stats(dev, sp);
@@ -5017,8 +5468,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
*idxattr = IFLA_STATS_LINK_XSTATS;
attr = nla_nest_start_noflag(skb,
IFLA_STATS_LINK_XSTATS);
- if (!attr)
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
nla_nest_end(skb, attr);
@@ -5040,8 +5493,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
*idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
attr = nla_nest_start_noflag(skb,
IFLA_STATS_LINK_XSTATS_SLAVE);
- if (!attr)
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
nla_nest_end(skb, attr);
@@ -5053,13 +5508,19 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
*idxattr)) {
+ u32 off_filter_mask;
+
+ off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS];
*idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
attr = nla_nest_start_noflag(skb,
IFLA_STATS_LINK_OFFLOAD_XSTATS);
- if (!attr)
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
- err = rtnl_get_offload_stats(skb, dev, prividx);
+ err = rtnl_offload_xstats_fill(skb, dev, prividx,
+ off_filter_mask, extack);
if (err == -ENODATA)
nla_nest_cancel(skb, attr);
else
@@ -5075,19 +5536,21 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
*idxattr = IFLA_STATS_AF_SPEC;
attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC);
- if (!attr)
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
rcu_read_lock();
list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
if (af_ops->fill_stats_af) {
struct nlattr *af;
- int err;
af = nla_nest_start_noflag(skb,
af_ops->family);
if (!af) {
rcu_read_unlock();
+ err = -EMSGSIZE;
goto nla_put_failure;
}
err = af_ops->fill_stats_af(skb, dev);
@@ -5120,13 +5583,14 @@ nla_put_failure:
else
nlmsg_end(skb, nlh);
- return -EMSGSIZE;
+ return err;
}
static size_t if_nlmsg_stats_size(const struct net_device *dev,
- u32 filter_mask)
+ const struct rtnl_stats_dump_filters *filters)
{
- size_t size = 0;
+ size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg));
+ unsigned int filter_mask = filters->mask[0];
if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0))
size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64));
@@ -5162,8 +5626,12 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
}
}
- if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
- size += rtnl_get_offload_stats_size(dev);
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) {
+ u32 off_filter_mask;
+
+ off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS];
+ size += rtnl_offload_xstats_get_size(dev, off_filter_mask);
+ }
if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) {
struct rtnl_af_ops *af_ops;
@@ -5187,6 +5655,79 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
return size;
}
+#define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1)
+
+static const struct nla_policy
+rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = {
+ [IFLA_STATS_LINK_OFFLOAD_XSTATS] =
+ NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID),
+};
+
+static const struct nla_policy
+rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = {
+ [IFLA_STATS_GET_FILTERS] =
+ NLA_POLICY_NESTED(rtnl_stats_get_policy_filters),
+};
+
+static const struct nla_policy
+ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = {
+ [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1),
+};
+
+static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters,
+ struct rtnl_stats_dump_filters *filters,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_STATS_MAX + 1];
+ int err;
+ int at;
+
+ err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters,
+ rtnl_stats_get_policy_filters, extack);
+ if (err < 0)
+ return err;
+
+ for (at = 1; at <= IFLA_STATS_MAX; at++) {
+ if (tb[at]) {
+ if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(at))) {
+ NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask");
+ return -EINVAL;
+ }
+ filters->mask[at] = nla_get_u32(tb[at]);
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_stats_get_parse(const struct nlmsghdr *nlh,
+ u32 filter_mask,
+ struct rtnl_stats_dump_filters *filters,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1];
+ int err;
+ int i;
+
+ filters->mask[0] = filter_mask;
+ for (i = 1; i < ARRAY_SIZE(filters->mask); i++)
+ filters->mask[i] = -1U;
+
+ err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb,
+ IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_STATS_GET_FILTERS]) {
+ err = rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS],
+ filters, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
bool is_dump, struct netlink_ext_ack *extack)
{
@@ -5209,10 +5750,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request");
return -EINVAL;
}
- if (nlmsg_attrlen(nlh, sizeof(*ifsm))) {
- NL_SET_ERR_MSG(extack, "Invalid attributes after stats header");
- return -EINVAL;
- }
if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) {
NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask");
return -EINVAL;
@@ -5224,12 +5761,12 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ struct rtnl_stats_dump_filters filters;
struct net *net = sock_net(skb->sk);
struct net_device *dev = NULL;
int idxattr = 0, prividx = 0;
struct if_stats_msg *ifsm;
struct sk_buff *nskb;
- u32 filter_mask;
int err;
err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb),
@@ -5246,17 +5783,22 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!dev)
return -ENODEV;
- filter_mask = ifsm->filter_mask;
- if (!filter_mask)
+ if (!ifsm->filter_mask) {
+ NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get");
return -EINVAL;
+ }
- nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL);
+ err = rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack);
+ if (err)
+ return err;
+
+ nskb = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL);
if (!nskb)
return -ENOBUFS;
err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS,
NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
- 0, filter_mask, &idxattr, &prividx);
+ 0, &filters, &idxattr, &prividx, extack);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_stats_size */
WARN_ON(err == -EMSGSIZE);
@@ -5272,12 +5814,12 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct netlink_ext_ack *extack = cb->extack;
int h, s_h, err, s_idx, s_idxattr, s_prividx;
+ struct rtnl_stats_dump_filters filters;
struct net *net = sock_net(skb->sk);
unsigned int flags = NLM_F_MULTI;
struct if_stats_msg *ifsm;
struct hlist_head *head;
struct net_device *dev;
- u32 filter_mask = 0;
int idx = 0;
s_h = cb->args[0];
@@ -5292,12 +5834,16 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
return err;
ifsm = nlmsg_data(cb->nlh);
- filter_mask = ifsm->filter_mask;
- if (!filter_mask) {
+ if (!ifsm->filter_mask) {
NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump");
return -EINVAL;
}
+ err = rtnl_stats_get_parse(cb->nlh, ifsm->filter_mask, &filters,
+ extack);
+ if (err)
+ return err;
+
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
idx = 0;
head = &net->dev_index_head[h];
@@ -5307,8 +5853,9 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, 0,
- flags, filter_mask,
- &s_idxattr, &s_prividx);
+ flags, &filters,
+ &s_idxattr, &s_prividx,
+ extack);
/* If we ran out of room on the first message,
* we're in trouble
*/
@@ -5332,6 +5879,107 @@ out:
return skb->len;
}
+void rtnl_offload_xstats_notify(struct net_device *dev)
+{
+ struct rtnl_stats_dump_filters response_filters = {};
+ struct net *net = dev_net(dev);
+ int idxattr = 0, prividx = 0;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ ASSERT_RTNL();
+
+ response_filters.mask[0] |=
+ IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |=
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO);
+
+ skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters),
+ GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0,
+ &response_filters, &idxattr, &prividx, NULL);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL);
+ return;
+
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_STATS, err);
+}
+EXPORT_SYMBOL(rtnl_offload_xstats_notify);
+
+static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ struct rtnl_stats_dump_filters response_filters = {};
+ struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev = NULL;
+ struct if_stats_msg *ifsm;
+ bool notify = false;
+ int err;
+
+ err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb),
+ false, extack);
+ if (err)
+ return err;
+
+ ifsm = nlmsg_data(nlh);
+ if (ifsm->family != AF_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC");
+ return -EINVAL;
+ }
+
+ if (ifsm->ifindex > 0)
+ dev = __dev_get_by_index(net, ifsm->ifindex);
+ else
+ return -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ if (ifsm->filter_mask) {
+ NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX,
+ ifla_stats_set_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) {
+ u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]);
+
+ if (req)
+ err = netdev_offload_xstats_enable(dev, t_l3, extack);
+ else
+ err = netdev_offload_xstats_disable(dev, t_l3);
+
+ if (!err)
+ notify = true;
+ else if (err != -EALREADY)
+ return err;
+
+ response_filters.mask[0] |=
+ IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |=
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO);
+ }
+
+ if (notify)
+ rtnl_offload_xstats_notify(dev);
+
+ return 0;
+}
+
/* Process one rtnetlink message. */
static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -5339,11 +5987,11 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct net *net = sock_net(skb->sk);
struct rtnl_link *link;
+ enum rtnl_kinds kind;
struct module *owner;
int err = -EOPNOTSUPP;
rtnl_doit_func doit;
unsigned int flags;
- int kind;
int family;
int type;
@@ -5358,16 +6006,16 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
return 0;
family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
- kind = type&3;
+ kind = rtnl_msgtype_kind(type);
- if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
+ if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN))
return -EPERM;
rcu_read_lock();
- if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
+ if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) {
struct sock *rtnl;
rtnl_dumpit_func dumpit;
- u16 min_dump_alloc = 0;
+ u32 min_dump_alloc = 0;
link = rtnl_get_link(family, type);
if (!link || !link->dumpit) {
@@ -5420,6 +6068,13 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
}
flags = link->flags;
+ if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) &&
+ !(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) {
+ NL_SET_ERR_MSG(extack, "Bulk delete is not supported");
+ module_put(owner);
+ goto err_unlock;
+ }
+
if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
doit = link->doit;
rcu_read_unlock();
@@ -5548,7 +6203,8 @@ void __init rtnetlink_init(void)
rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0);
rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0);
+ rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL,
+ RTNL_FLAG_BULK_DEL_SUPPORTED);
rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0);
rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0);
@@ -5557,4 +6213,5 @@ void __init rtnetlink_init(void)
rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump,
0);
+ rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0);
}
diff --git a/net/core/scm.c b/net/core/scm.c
index dc6fed1f221c..5c356f0dee30 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -79,7 +79,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
if (!fpl)
{
- fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+ fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT);
if (!fpl)
return -ENOMEM;
*fplp = fpl;
@@ -212,16 +212,12 @@ EXPORT_SYMBOL(__scm_send);
int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
{
- struct cmsghdr __user *cm
- = (__force struct cmsghdr __user *)msg->msg_control;
- struct cmsghdr cmhdr;
int cmlen = CMSG_LEN(len);
- int err;
- if (MSG_CMSG_COMPAT & msg->msg_flags)
+ if (msg->msg_flags & MSG_CMSG_COMPAT)
return put_cmsg_compat(msg, level, type, len, data);
- if (cm==NULL || msg->msg_controllen < sizeof(*cm)) {
+ if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) {
msg->msg_flags |= MSG_CTRUNC;
return 0; /* XXX: return error? check spec. */
}
@@ -229,23 +225,37 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
msg->msg_flags |= MSG_CTRUNC;
cmlen = msg->msg_controllen;
}
- cmhdr.cmsg_level = level;
- cmhdr.cmsg_type = type;
- cmhdr.cmsg_len = cmlen;
-
- err = -EFAULT;
- if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
- goto out;
- if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
- goto out;
- cmlen = CMSG_SPACE(len);
- if (msg->msg_controllen < cmlen)
- cmlen = msg->msg_controllen;
+
+ if (msg->msg_control_is_user) {
+ struct cmsghdr __user *cm = msg->msg_control_user;
+
+ if (!user_write_access_begin(cm, cmlen))
+ goto efault;
+
+ unsafe_put_user(cmlen, &cm->cmsg_len, efault_end);
+ unsafe_put_user(level, &cm->cmsg_level, efault_end);
+ unsafe_put_user(type, &cm->cmsg_type, efault_end);
+ unsafe_copy_to_user(CMSG_USER_DATA(cm), data,
+ cmlen - sizeof(*cm), efault_end);
+ user_write_access_end();
+ } else {
+ struct cmsghdr *cm = msg->msg_control;
+
+ cm->cmsg_level = level;
+ cm->cmsg_type = type;
+ cm->cmsg_len = cmlen;
+ memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm));
+ }
+
+ cmlen = min(CMSG_SPACE(len), msg->msg_controllen);
msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
- err = 0;
-out:
- return err;
+ return 0;
+
+efault_end:
+ user_write_access_end();
+efault:
+ return -EFAULT;
}
EXPORT_SYMBOL(put_cmsg);
@@ -277,78 +287,60 @@ void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_inter
}
EXPORT_SYMBOL(put_cmsg_scm_timestamping);
-void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
+static int scm_max_fds(struct msghdr *msg)
{
- struct cmsghdr __user *cm
- = (__force struct cmsghdr __user*)msg->msg_control;
+ if (msg->msg_controllen <= sizeof(struct cmsghdr))
+ return 0;
+ return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int);
+}
- int fdmax = 0;
- int fdnum = scm->fp->count;
- struct file **fp = scm->fp->fp;
- int __user *cmfptr;
+void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct cmsghdr __user *cm =
+ (__force struct cmsghdr __user *)msg->msg_control;
+ unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0;
+ int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count);
+ int __user *cmsg_data = CMSG_USER_DATA(cm);
int err = 0, i;
- if (MSG_CMSG_COMPAT & msg->msg_flags) {
+ /* no use for FD passing from kernel space callers */
+ if (WARN_ON_ONCE(!msg->msg_control_is_user))
+ return;
+
+ if (msg->msg_flags & MSG_CMSG_COMPAT) {
scm_detach_fds_compat(msg, scm);
return;
}
- if (msg->msg_controllen > sizeof(struct cmsghdr))
- fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
- / sizeof(int));
-
- if (fdnum < fdmax)
- fdmax = fdnum;
-
- for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
- i++, cmfptr++)
- {
- struct socket *sock;
- int new_fd;
- err = security_file_receive(fp[i]);
- if (err)
- break;
- err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
- ? O_CLOEXEC : 0);
+ for (i = 0; i < fdmax; i++) {
+ err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags);
if (err < 0)
break;
- new_fd = err;
- err = put_user(new_fd, cmfptr);
- if (err) {
- put_unused_fd(new_fd);
- break;
- }
- /* Bump the usage count and install the file. */
- sock = sock_from_file(fp[i], &err);
- if (sock) {
- sock_update_netprioidx(&sock->sk->sk_cgrp_data);
- sock_update_classid(&sock->sk->sk_cgrp_data);
- }
- fd_install(new_fd, get_file(fp[i]));
}
- if (i > 0)
- {
- int cmlen = CMSG_LEN(i*sizeof(int));
+ if (i > 0) {
+ int cmlen = CMSG_LEN(i * sizeof(int));
+
err = put_user(SOL_SOCKET, &cm->cmsg_level);
if (!err)
err = put_user(SCM_RIGHTS, &cm->cmsg_type);
if (!err)
err = put_user(cmlen, &cm->cmsg_len);
if (!err) {
- cmlen = CMSG_SPACE(i*sizeof(int));
+ cmlen = CMSG_SPACE(i * sizeof(int));
if (msg->msg_controllen < cmlen)
cmlen = msg->msg_controllen;
msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
}
}
- if (i < fdnum || (fdnum && fdmax <= 0))
+
+ if (i < scm->fp->count || (scm->fp->count && fdmax <= 0))
msg->msg_flags |= MSG_CTRUNC;
/*
- * All of the files that fit in the message have had their
- * usage counts incremented, so we just free the list.
+ * All of the files that fit in the message have had their usage counts
+ * incremented, so we just free the list.
*/
__scm_destroy(scm);
}
@@ -363,7 +355,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
return NULL;
new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (new_fpl) {
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 7b6b1d2c3d10..b0ff6153be62 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -5,7 +5,6 @@
#include <linux/kernel.h>
#include <linux/init.h>
-#include <linux/cryptohash.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/random.h>
@@ -20,8 +19,10 @@
#include <linux/in6.h>
#include <net/tcp.h>
-static siphash_key_t net_secret __read_mostly;
-static siphash_key_t ts_secret __read_mostly;
+static siphash_aligned_key_t net_secret;
+static siphash_aligned_key_t ts_secret;
+
+#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ)
static __always_inline void net_secret_init(void)
{
@@ -63,7 +64,7 @@ u32 secure_tcpv6_ts_off(const struct net *net,
.daddr = *(struct in6_addr *)daddr,
};
- if (net->ipv4.sysctl_tcp_timestamps != 1)
+ if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
return 0;
ts_secret_init();
@@ -95,17 +96,19 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
}
EXPORT_SYMBOL(secure_tcpv6_seq);
-u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
+u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
__be16 dport)
{
const struct {
struct in6_addr saddr;
struct in6_addr daddr;
+ unsigned int timeseed;
__be16 dport;
} __aligned(SIPHASH_ALIGNMENT) combined = {
.saddr = *(struct in6_addr *)saddr,
.daddr = *(struct in6_addr *)daddr,
- .dport = dport
+ .timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD,
+ .dport = dport,
};
net_secret_init();
return siphash(&combined, offsetofend(typeof(combined), dport),
@@ -117,7 +120,7 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#ifdef CONFIG_INET
u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr)
{
- if (net->ipv4.sysctl_tcp_timestamps != 1)
+ if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
return 0;
ts_secret_init();
@@ -143,11 +146,13 @@ u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
}
EXPORT_SYMBOL_GPL(secure_tcp_seq);
-u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
+u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
net_secret_init();
- return siphash_3u32((__force u32)saddr, (__force u32)daddr,
- (__force u16)dport, &net_secret);
+ return siphash_4u32((__force u32)saddr, (__force u32)daddr,
+ (__force u16)dport,
+ jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD,
+ &net_secret);
}
EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
#endif
diff --git a/net/core/selftests.c b/net/core/selftests.c
new file mode 100644
index 000000000000..acb1ee97bbd3
--- /dev/null
+++ b/net/core/selftests.c
@@ -0,0 +1,412 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Synopsys, Inc. and/or its affiliates.
+ * stmmac Selftests Support
+ *
+ * Author: Jose Abreu <joabreu@synopsys.com>
+ *
+ * Ported from stmmac by:
+ * Copyright (C) 2021 Oleksij Rempel <o.rempel@pengutronix.de>
+ */
+
+#include <linux/phy.h>
+#include <net/selftests.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+struct net_packet_attrs {
+ const unsigned char *src;
+ const unsigned char *dst;
+ u32 ip_src;
+ u32 ip_dst;
+ bool tcp;
+ u16 sport;
+ u16 dport;
+ int timeout;
+ int size;
+ int max_size;
+ u8 id;
+ u16 queue_mapping;
+};
+
+struct net_test_priv {
+ struct net_packet_attrs *packet;
+ struct packet_type pt;
+ struct completion comp;
+ int double_vlan;
+ int vlan_id;
+ int ok;
+};
+
+struct netsfhdr {
+ __be32 version;
+ __be64 magic;
+ u8 id;
+} __packed;
+
+static u8 net_test_next_id;
+
+#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \
+ sizeof(struct netsfhdr))
+#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL
+#define NET_LB_TIMEOUT msecs_to_jiffies(200)
+
+static struct sk_buff *net_test_get_skb(struct net_device *ndev,
+ struct net_packet_attrs *attr)
+{
+ struct sk_buff *skb = NULL;
+ struct udphdr *uhdr = NULL;
+ struct tcphdr *thdr = NULL;
+ struct netsfhdr *shdr;
+ struct ethhdr *ehdr;
+ struct iphdr *ihdr;
+ int iplen, size;
+
+ size = attr->size + NET_TEST_PKT_SIZE;
+
+ if (attr->tcp)
+ size += sizeof(struct tcphdr);
+ else
+ size += sizeof(struct udphdr);
+
+ if (attr->max_size && attr->max_size > size)
+ size = attr->max_size;
+
+ skb = netdev_alloc_skb(ndev, size);
+ if (!skb)
+ return NULL;
+
+ prefetchw(skb->data);
+
+ ehdr = skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+
+ skb_set_network_header(skb, skb->len);
+ ihdr = skb_put(skb, sizeof(*ihdr));
+
+ skb_set_transport_header(skb, skb->len);
+ if (attr->tcp)
+ thdr = skb_put(skb, sizeof(*thdr));
+ else
+ uhdr = skb_put(skb, sizeof(*uhdr));
+
+ eth_zero_addr(ehdr->h_dest);
+
+ if (attr->src)
+ ether_addr_copy(ehdr->h_source, attr->src);
+ if (attr->dst)
+ ether_addr_copy(ehdr->h_dest, attr->dst);
+
+ ehdr->h_proto = htons(ETH_P_IP);
+
+ if (attr->tcp) {
+ thdr->source = htons(attr->sport);
+ thdr->dest = htons(attr->dport);
+ thdr->doff = sizeof(struct tcphdr) / 4;
+ thdr->check = 0;
+ } else {
+ uhdr->source = htons(attr->sport);
+ uhdr->dest = htons(attr->dport);
+ uhdr->len = htons(sizeof(*shdr) + sizeof(*uhdr) + attr->size);
+ if (attr->max_size)
+ uhdr->len = htons(attr->max_size -
+ (sizeof(*ihdr) + sizeof(*ehdr)));
+ uhdr->check = 0;
+ }
+
+ ihdr->ihl = 5;
+ ihdr->ttl = 32;
+ ihdr->version = 4;
+ if (attr->tcp)
+ ihdr->protocol = IPPROTO_TCP;
+ else
+ ihdr->protocol = IPPROTO_UDP;
+ iplen = sizeof(*ihdr) + sizeof(*shdr) + attr->size;
+ if (attr->tcp)
+ iplen += sizeof(*thdr);
+ else
+ iplen += sizeof(*uhdr);
+
+ if (attr->max_size)
+ iplen = attr->max_size - sizeof(*ehdr);
+
+ ihdr->tot_len = htons(iplen);
+ ihdr->frag_off = 0;
+ ihdr->saddr = htonl(attr->ip_src);
+ ihdr->daddr = htonl(attr->ip_dst);
+ ihdr->tos = 0;
+ ihdr->id = 0;
+ ip_send_check(ihdr);
+
+ shdr = skb_put(skb, sizeof(*shdr));
+ shdr->version = 0;
+ shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC);
+ attr->id = net_test_next_id;
+ shdr->id = net_test_next_id++;
+
+ if (attr->size)
+ skb_put(skb, attr->size);
+ if (attr->max_size && attr->max_size > skb->len)
+ skb_put(skb, attr->max_size - skb->len);
+
+ skb->csum = 0;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ if (attr->tcp) {
+ thdr->check = ~tcp_v4_check(skb->len, ihdr->saddr,
+ ihdr->daddr, 0);
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct tcphdr, check);
+ } else {
+ udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr);
+ }
+
+ skb->protocol = htons(ETH_P_IP);
+ skb->pkt_type = PACKET_HOST;
+ skb->dev = ndev;
+
+ return skb;
+}
+
+static int net_test_loopback_validate(struct sk_buff *skb,
+ struct net_device *ndev,
+ struct packet_type *pt,
+ struct net_device *orig_ndev)
+{
+ struct net_test_priv *tpriv = pt->af_packet_priv;
+ const unsigned char *src = tpriv->packet->src;
+ const unsigned char *dst = tpriv->packet->dst;
+ struct netsfhdr *shdr;
+ struct ethhdr *ehdr;
+ struct udphdr *uhdr;
+ struct tcphdr *thdr;
+ struct iphdr *ihdr;
+
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out;
+
+ if (skb_linearize(skb))
+ goto out;
+ if (skb_headlen(skb) < (NET_TEST_PKT_SIZE - ETH_HLEN))
+ goto out;
+
+ ehdr = (struct ethhdr *)skb_mac_header(skb);
+ if (dst) {
+ if (!ether_addr_equal_unaligned(ehdr->h_dest, dst))
+ goto out;
+ }
+
+ if (src) {
+ if (!ether_addr_equal_unaligned(ehdr->h_source, src))
+ goto out;
+ }
+
+ ihdr = ip_hdr(skb);
+ if (tpriv->double_vlan)
+ ihdr = (struct iphdr *)(skb_network_header(skb) + 4);
+
+ if (tpriv->packet->tcp) {
+ if (ihdr->protocol != IPPROTO_TCP)
+ goto out;
+
+ thdr = (struct tcphdr *)((u8 *)ihdr + 4 * ihdr->ihl);
+ if (thdr->dest != htons(tpriv->packet->dport))
+ goto out;
+
+ shdr = (struct netsfhdr *)((u8 *)thdr + sizeof(*thdr));
+ } else {
+ if (ihdr->protocol != IPPROTO_UDP)
+ goto out;
+
+ uhdr = (struct udphdr *)((u8 *)ihdr + 4 * ihdr->ihl);
+ if (uhdr->dest != htons(tpriv->packet->dport))
+ goto out;
+
+ shdr = (struct netsfhdr *)((u8 *)uhdr + sizeof(*uhdr));
+ }
+
+ if (shdr->magic != cpu_to_be64(NET_TEST_PKT_MAGIC))
+ goto out;
+ if (tpriv->packet->id != shdr->id)
+ goto out;
+
+ tpriv->ok = true;
+ complete(&tpriv->comp);
+out:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int __net_test_loopback(struct net_device *ndev,
+ struct net_packet_attrs *attr)
+{
+ struct net_test_priv *tpriv;
+ struct sk_buff *skb = NULL;
+ int ret = 0;
+
+ tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL);
+ if (!tpriv)
+ return -ENOMEM;
+
+ tpriv->ok = false;
+ init_completion(&tpriv->comp);
+
+ tpriv->pt.type = htons(ETH_P_IP);
+ tpriv->pt.func = net_test_loopback_validate;
+ tpriv->pt.dev = ndev;
+ tpriv->pt.af_packet_priv = tpriv;
+ tpriv->packet = attr;
+ dev_add_pack(&tpriv->pt);
+
+ skb = net_test_get_skb(ndev, attr);
+ if (!skb) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+
+ ret = dev_direct_xmit(skb, attr->queue_mapping);
+ if (ret < 0) {
+ goto cleanup;
+ } else if (ret > 0) {
+ ret = -ENETUNREACH;
+ goto cleanup;
+ }
+
+ if (!attr->timeout)
+ attr->timeout = NET_LB_TIMEOUT;
+
+ wait_for_completion_timeout(&tpriv->comp, attr->timeout);
+ ret = tpriv->ok ? 0 : -ETIMEDOUT;
+
+cleanup:
+ dev_remove_pack(&tpriv->pt);
+ kfree(tpriv);
+ return ret;
+}
+
+static int net_test_netif_carrier(struct net_device *ndev)
+{
+ return netif_carrier_ok(ndev) ? 0 : -ENOLINK;
+}
+
+static int net_test_phy_phydev(struct net_device *ndev)
+{
+ return ndev->phydev ? 0 : -EOPNOTSUPP;
+}
+
+static int net_test_phy_loopback_enable(struct net_device *ndev)
+{
+ if (!ndev->phydev)
+ return -EOPNOTSUPP;
+
+ return phy_loopback(ndev->phydev, true);
+}
+
+static int net_test_phy_loopback_disable(struct net_device *ndev)
+{
+ if (!ndev->phydev)
+ return -EOPNOTSUPP;
+
+ return phy_loopback(ndev->phydev, false);
+}
+
+static int net_test_phy_loopback_udp(struct net_device *ndev)
+{
+ struct net_packet_attrs attr = { };
+
+ attr.dst = ndev->dev_addr;
+ return __net_test_loopback(ndev, &attr);
+}
+
+static int net_test_phy_loopback_udp_mtu(struct net_device *ndev)
+{
+ struct net_packet_attrs attr = { };
+
+ attr.dst = ndev->dev_addr;
+ attr.max_size = ndev->mtu;
+ return __net_test_loopback(ndev, &attr);
+}
+
+static int net_test_phy_loopback_tcp(struct net_device *ndev)
+{
+ struct net_packet_attrs attr = { };
+
+ attr.dst = ndev->dev_addr;
+ attr.tcp = true;
+ return __net_test_loopback(ndev, &attr);
+}
+
+static const struct net_test {
+ char name[ETH_GSTRING_LEN];
+ int (*fn)(struct net_device *ndev);
+} net_selftests[] = {
+ {
+ .name = "Carrier ",
+ .fn = net_test_netif_carrier,
+ }, {
+ .name = "PHY dev is present ",
+ .fn = net_test_phy_phydev,
+ }, {
+ /* This test should be done before all PHY loopback test */
+ .name = "PHY internal loopback, enable ",
+ .fn = net_test_phy_loopback_enable,
+ }, {
+ .name = "PHY internal loopback, UDP ",
+ .fn = net_test_phy_loopback_udp,
+ }, {
+ .name = "PHY internal loopback, MTU ",
+ .fn = net_test_phy_loopback_udp_mtu,
+ }, {
+ .name = "PHY internal loopback, TCP ",
+ .fn = net_test_phy_loopback_tcp,
+ }, {
+ /* This test should be done after all PHY loopback test */
+ .name = "PHY internal loopback, disable",
+ .fn = net_test_phy_loopback_disable,
+ },
+};
+
+void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf)
+{
+ int count = net_selftest_get_count();
+ int i;
+
+ memset(buf, 0, sizeof(*buf) * count);
+ net_test_next_id = 0;
+
+ if (etest->flags != ETH_TEST_FL_OFFLINE) {
+ netdev_err(ndev, "Only offline tests are supported\n");
+ etest->flags |= ETH_TEST_FL_FAILED;
+ return;
+ }
+
+
+ for (i = 0; i < count; i++) {
+ buf[i] = net_selftests[i].fn(ndev);
+ if (buf[i] && (buf[i] != -EOPNOTSUPP))
+ etest->flags |= ETH_TEST_FL_FAILED;
+ }
+}
+EXPORT_SYMBOL_GPL(net_selftest);
+
+int net_selftest_get_count(void)
+{
+ return ARRAY_SIZE(net_selftests);
+}
+EXPORT_SYMBOL_GPL(net_selftest_get_count);
+
+void net_selftest_get_strings(u8 *data)
+{
+ u8 *p = data;
+ int i;
+
+ for (i = 0; i < net_selftest_get_count(); i++) {
+ snprintf(p, ETH_GSTRING_LEN, "%2d. %s", i + 1,
+ net_selftests[i].name);
+ p += ETH_GSTRING_LEN;
+ }
+}
+EXPORT_SYMBOL_GPL(net_selftest_get_strings);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Oleksij Rempel <o.rempel@pengutronix.de>");
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e1101a4f90a6..88fa40571d0c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -60,6 +60,7 @@
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
+#include <linux/kcov.h>
#include <net/protocol.h>
#include <net/dst.h>
@@ -69,6 +70,8 @@
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
+#include <net/mctp.h>
+#include <net/page_pool.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
@@ -77,7 +80,8 @@
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
-#include "datagram.h"
+#include "dev.h"
+#include "sock_destructor.h"
struct kmem_cache *skbuff_head_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
@@ -87,6 +91,13 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init;
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);
+#undef FN
+#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
+const char * const drop_reasons[] = {
+ DEFINE_DROP_REASON(FN, FN)
+};
+EXPORT_SYMBOL(drop_reasons);
+
/**
* skb_panic - private function for out-of-line support
* @skb: buffer
@@ -102,7 +113,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags);
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
const char msg[])
{
- pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
+ pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
msg, addr, skb->len, sz, skb->head, skb->data,
(unsigned long)skb->tail, (unsigned long)skb->end,
skb->dev ? skb->dev->name : "<NULL>");
@@ -119,148 +130,148 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
skb_panic(skb, sz, addr, __func__);
}
-/*
- * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
- * the caller if emergency pfmemalloc reserves are being used. If it is and
- * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
- * may be used. Otherwise, the packet data may be discarded until enough
- * memory is free
+#define NAPI_SKB_CACHE_SIZE 64
+#define NAPI_SKB_CACHE_BULK 16
+#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
+
+#if PAGE_SIZE == SZ_4K
+
+#define NAPI_HAS_SMALL_PAGE_FRAG 1
+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc)
+
+/* specialized page frag allocator using a single order 0 page
+ * and slicing it into 1K sized fragment. Constrained to systems
+ * with a very limited amount of 1K fragments fitting a single
+ * page - to avoid excessive truesize underestimation
*/
-#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
- __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
-static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
- unsigned long ip, bool *pfmemalloc)
+struct page_frag_1k {
+ void *va;
+ u16 offset;
+ bool pfmemalloc;
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
{
- void *obj;
- bool ret_pfmemalloc = false;
+ struct page *page;
+ int offset;
- /*
- * Try a regular allocation, when that fails and we're not entitled
- * to the reserves, fail.
- */
- obj = kmalloc_node_track_caller(size,
- flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
- node);
- if (obj || !(gfp_pfmemalloc_allowed(flags)))
- goto out;
+ offset = nc->offset - SZ_1K;
+ if (likely(offset >= 0))
+ goto use_frag;
- /* Try again but now we are using pfmemalloc reserves */
- ret_pfmemalloc = true;
- obj = kmalloc_node_track_caller(size, flags, node);
+ page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+ if (!page)
+ return NULL;
-out:
- if (pfmemalloc)
- *pfmemalloc = ret_pfmemalloc;
+ nc->va = page_address(page);
+ nc->pfmemalloc = page_is_pfmemalloc(page);
+ offset = PAGE_SIZE - SZ_1K;
+ page_ref_add(page, offset / SZ_1K);
- return obj;
+use_frag:
+ nc->offset = offset;
+ return nc->va + offset;
}
+#else
-/* Allocate a new skbuff. We do this ourselves so we can fill in a few
- * 'private' fields and also do memory statistics to find all the
- * [BEEP] leaks.
- *
+/* the small page is actually unused in this build; add dummy helpers
+ * to please the compiler and avoid later preprocessor's conditionals
*/
+#define NAPI_HAS_SMALL_PAGE_FRAG 0
+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false
-/**
- * __alloc_skb - allocate a network buffer
- * @size: size to allocate
- * @gfp_mask: allocation mask
- * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
- * instead of head cache and allocate a cloned (child) skb.
- * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
- * allocations in case the data is required for writeback
- * @node: numa node to allocate memory on
- *
- * Allocate a new &sk_buff. The returned buffer has no headroom and a
- * tail room of at least size bytes. The object has a reference count
- * of one. The return is the buffer. On a failure the return is %NULL.
- *
- * Buffers may only be allocated from interrupts using a @gfp_mask of
- * %GFP_ATOMIC.
+struct page_frag_1k {
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
+{
+ return NULL;
+}
+
+#endif
+
+struct napi_alloc_cache {
+ struct page_frag_cache page;
+ struct page_frag_1k page_small;
+ unsigned int skb_count;
+ void *skb_cache[NAPI_SKB_CACHE_SIZE];
+};
+
+static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
+
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
*/
-struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
- int flags, int node)
+void napi_get_frags_check(struct napi_struct *napi)
{
- struct kmem_cache *cache;
- struct skb_shared_info *shinfo;
struct sk_buff *skb;
- u8 *data;
- bool pfmemalloc;
- cache = (flags & SKB_ALLOC_FCLONE)
- ? skbuff_fclone_cache : skbuff_head_cache;
+ local_bh_disable();
+ skb = napi_get_frags(napi);
+ WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
+ napi_free_frags(napi);
+ local_bh_enable();
+}
- if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
- gfp_mask |= __GFP_MEMALLOC;
+void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- /* Get the HEAD */
- skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
- if (!skb)
- goto out;
- prefetchw(skb);
+ fragsz = SKB_DATA_ALIGN(fragsz);
- /* We do our best to align skb_shared_info on a separate cache
- * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
- * aligned memory blocks, unless SLUB/SLAB debug is enabled.
- * Both skb->head and skb_shared_info are cache line aligned.
- */
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
- if (!data)
- goto nodata;
- /* kmalloc(size) might give us more room than requested.
- * Put skb_shared_info exactly at the end of allocated zone,
- * to allow max possible filling before reallocation.
- */
- size = SKB_WITH_OVERHEAD(ksize(data));
- prefetchw(data + size);
+ return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
+}
+EXPORT_SYMBOL(__napi_alloc_frag_align);
- /*
- * Only clear those fields we need to clear, not those that we will
- * actually initialise below. Hence, don't put any more fields after
- * the tail pointer in struct sk_buff!
- */
- memset(skb, 0, offsetof(struct sk_buff, tail));
- /* Account for allocated memory : skb + skb->head */
- skb->truesize = SKB_TRUESIZE(size);
- skb->pfmemalloc = pfmemalloc;
- refcount_set(&skb->users, 1);
- skb->head = data;
- skb->data = data;
- skb_reset_tail_pointer(skb);
- skb->end = skb->tail + size;
- skb->mac_header = (typeof(skb->mac_header))~0U;
- skb->transport_header = (typeof(skb->transport_header))~0U;
+void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
+{
+ void *data;
- /* make sure we initialize shinfo sequentially */
- shinfo = skb_shinfo(skb);
- memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
- atomic_set(&shinfo->dataref, 1);
+ fragsz = SKB_DATA_ALIGN(fragsz);
+ if (in_hardirq() || irqs_disabled()) {
+ struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
- if (flags & SKB_ALLOC_FCLONE) {
- struct sk_buff_fclones *fclones;
+ data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
+ } else {
+ struct napi_alloc_cache *nc;
- fclones = container_of(skb, struct sk_buff_fclones, skb1);
+ local_bh_disable();
+ nc = this_cpu_ptr(&napi_alloc_cache);
+ data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
+ local_bh_enable();
+ }
+ return data;
+}
+EXPORT_SYMBOL(__netdev_alloc_frag_align);
- skb->fclone = SKB_FCLONE_ORIG;
- refcount_set(&fclones->fclone_ref, 1);
+static struct sk_buff *napi_skb_cache_get(void)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct sk_buff *skb;
- fclones->skb2.fclone = SKB_FCLONE_CLONE;
+ if (unlikely(!nc->skb_count)) {
+ nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
+ GFP_ATOMIC,
+ NAPI_SKB_CACHE_BULK,
+ nc->skb_cache);
+ if (unlikely(!nc->skb_count))
+ return NULL;
}
-out:
+
+ skb = nc->skb_cache[--nc->skb_count];
+ kasan_unpoison_object_data(skbuff_head_cache, skb);
+
return skb;
-nodata:
- kmem_cache_free(cache, skb);
- skb = NULL;
- goto out;
}
-EXPORT_SYMBOL(__alloc_skb);
/* Caller must provide SKB that is memset cleared */
-static struct sk_buff *__build_skb_around(struct sk_buff *skb,
- void *data, unsigned int frag_size)
+static void __build_skb_around(struct sk_buff *skb, void *data,
+ unsigned int frag_size)
{
struct skb_shared_info *shinfo;
unsigned int size = frag_size ? : ksize(data);
@@ -273,16 +284,16 @@ static struct sk_buff *__build_skb_around(struct sk_buff *skb,
skb->head = data;
skb->data = data;
skb_reset_tail_pointer(skb);
- skb->end = skb->tail + size;
+ skb_set_end_offset(skb, size);
skb->mac_header = (typeof(skb->mac_header))~0U;
skb->transport_header = (typeof(skb->transport_header))~0U;
-
+ skb->alloc_cpu = raw_smp_processor_id();
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
atomic_set(&shinfo->dataref, 1);
- return skb;
+ skb_set_kcov_handle(skb, kcov_common_handle());
}
/**
@@ -313,8 +324,9 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
return NULL;
memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, frag_size);
- return __build_skb_around(skb, data, frag_size);
+ return skb;
}
/* build_skb() is wrapper over __build_skb(), that specifically
@@ -347,9 +359,9 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
if (unlikely(!skb))
return NULL;
- skb = __build_skb_around(skb, data, frag_size);
+ __build_skb_around(skb, data, frag_size);
- if (skb && frag_size) {
+ if (frag_size) {
skb->head_frag = 1;
if (page_is_pfmemalloc(virt_to_head_page(data)))
skb->pfmemalloc = 1;
@@ -358,56 +370,178 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
}
EXPORT_SYMBOL(build_skb_around);
-#define NAPI_SKB_CACHE_SIZE 64
+/**
+ * __napi_build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data, or 0 if head was kmalloced
+ *
+ * Version of __build_skb() that uses NAPI percpu caches to obtain
+ * skbuff_head instead of inplace allocation.
+ *
+ * Returns a new &sk_buff on success, %NULL on allocation failure.
+ */
+static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
+{
+ struct sk_buff *skb;
-struct napi_alloc_cache {
- struct page_frag_cache page;
- unsigned int skb_count;
- void *skb_cache[NAPI_SKB_CACHE_SIZE];
-};
+ skb = napi_skb_cache_get();
+ if (unlikely(!skb))
+ return NULL;
-static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, frag_size);
+
+ return skb;
+}
-static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+/**
+ * napi_build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data, or 0 if head was kmalloced
+ *
+ * Version of __napi_build_skb() that takes care of skb->head_frag
+ * and skb->pfmemalloc when the data is a page or page fragment.
+ *
+ * Returns a new &sk_buff on success, %NULL on allocation failure.
+ */
+struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct sk_buff *skb = __napi_build_skb(data, frag_size);
- return page_frag_alloc(&nc->page, fragsz, gfp_mask);
+ if (likely(skb) && frag_size) {
+ skb->head_frag = 1;
+ skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
+ }
+
+ return skb;
}
+EXPORT_SYMBOL(napi_build_skb);
-void *napi_alloc_frag(unsigned int fragsz)
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
+ bool *pfmemalloc)
{
- fragsz = SKB_DATA_ALIGN(fragsz);
+ void *obj;
+ bool ret_pfmemalloc = false;
+
+ /*
+ * Try a regular allocation, when that fails and we're not entitled
+ * to the reserves, fail.
+ */
+ obj = kmalloc_node_track_caller(size,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
+
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+ if (pfmemalloc)
+ *pfmemalloc = ret_pfmemalloc;
- return __napi_alloc_frag(fragsz, GFP_ATOMIC);
+ return obj;
}
-EXPORT_SYMBOL(napi_alloc_frag);
+
+/* Allocate a new skbuff. We do this ourselves so we can fill in a few
+ * 'private' fields and also do memory statistics to find all the
+ * [BEEP] leaks.
+ *
+ */
/**
- * netdev_alloc_frag - allocate a page fragment
- * @fragsz: fragment size
+ * __alloc_skb - allocate a network buffer
+ * @size: size to allocate
+ * @gfp_mask: allocation mask
+ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ * instead of head cache and allocate a cloned (child) skb.
+ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case the data is required for writeback
+ * @node: numa node to allocate memory on
+ *
+ * Allocate a new &sk_buff. The returned buffer has no headroom and a
+ * tail room of at least size bytes. The object has a reference count
+ * of one. The return is the buffer. On a failure the return is %NULL.
*
- * Allocates a frag from a page for receive buffer.
- * Uses GFP_ATOMIC allocations.
+ * Buffers may only be allocated from interrupts using a @gfp_mask of
+ * %GFP_ATOMIC.
*/
-void *netdev_alloc_frag(unsigned int fragsz)
+struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+ int flags, int node)
{
- struct page_frag_cache *nc;
- void *data;
+ struct kmem_cache *cache;
+ struct sk_buff *skb;
+ unsigned int osize;
+ bool pfmemalloc;
+ u8 *data;
- fragsz = SKB_DATA_ALIGN(fragsz);
- if (in_irq() || irqs_disabled()) {
- nc = this_cpu_ptr(&netdev_alloc_cache);
- data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);
- } else {
- local_bh_disable();
- data = __napi_alloc_frag(fragsz, GFP_ATOMIC);
- local_bh_enable();
+ cache = (flags & SKB_ALLOC_FCLONE)
+ ? skbuff_fclone_cache : skbuff_head_cache;
+
+ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+ gfp_mask |= __GFP_MEMALLOC;
+
+ /* Get the HEAD */
+ if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
+ likely(node == NUMA_NO_NODE || node == numa_mem_id()))
+ skb = napi_skb_cache_get();
+ else
+ skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
+ if (unlikely(!skb))
+ return NULL;
+ prefetchw(skb);
+
+ /* We do our best to align skb_shared_info on a separate cache
+ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
+ * aligned memory blocks, unless SLUB/SLAB debug is enabled.
+ * Both skb->head and skb_shared_info are cache line aligned.
+ */
+ size = SKB_DATA_ALIGN(size);
+ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
+ if (unlikely(!data))
+ goto nodata;
+ /* kmalloc(size) might give us more room than requested.
+ * Put skb_shared_info exactly at the end of allocated zone,
+ * to allow max possible filling before reallocation.
+ */
+ osize = ksize(data);
+ size = SKB_WITH_OVERHEAD(osize);
+ prefetchw(data + size);
+
+ /*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise below. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, osize);
+ skb->pfmemalloc = pfmemalloc;
+
+ if (flags & SKB_ALLOC_FCLONE) {
+ struct sk_buff_fclones *fclones;
+
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+
+ skb->fclone = SKB_FCLONE_ORIG;
+ refcount_set(&fclones->fclone_ref, 1);
}
- return data;
+
+ return skb;
+
+nodata:
+ kmem_cache_free(cache, skb);
+ return NULL;
}
-EXPORT_SYMBOL(netdev_alloc_frag);
+EXPORT_SYMBOL(__alloc_skb);
/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
@@ -432,7 +566,11 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
len += NET_SKB_PAD;
- if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+ /* If requested length is either too small or too big,
+ * we use kmalloc() for skb->head allocation.
+ */
+ if (len <= SKB_WITH_OVERHEAD(1024) ||
+ len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
if (!skb)
@@ -446,7 +584,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- if (in_irq() || irqs_disabled()) {
+ if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc(nc, len, gfp_mask);
pfmemalloc = nc->pfmemalloc;
@@ -496,37 +634,67 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
gfp_t gfp_mask)
{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc;
struct sk_buff *skb;
+ bool pfmemalloc;
void *data;
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
len += NET_SKB_PAD + NET_IP_ALIGN;
- if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+ /* If requested length is either too small or too big,
+ * we use kmalloc() for skb->head allocation.
+ * When the small frag allocator is available, prefer it over kmalloc
+ * for small fragments
+ */
+ if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
+ len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
- skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
+ NUMA_NO_NODE);
if (!skb)
goto skb_fail;
goto skb_success;
}
- len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- len = SKB_DATA_ALIGN(len);
+ nc = this_cpu_ptr(&napi_alloc_cache);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- data = page_frag_alloc(&nc->page, len, gfp_mask);
+ if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
+ /* we are artificially inflating the allocation size, but
+ * that is not as bad as it may look like, as:
+ * - 'len' less than GRO_MAX_HEAD makes little sense
+ * - On most systems, larger 'len' values lead to fragment
+ * size above 512 bytes
+ * - kmalloc would use the kmalloc-1k slab for such values
+ * - Builds with smaller GRO_MAX_HEAD will very likely do
+ * little networking, as that implies no WiFi and no
+ * tunnels support, and 32 bits arches.
+ */
+ len = SZ_1K;
+
+ data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
+ pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
+ } else {
+ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ len = SKB_DATA_ALIGN(len);
+
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
+ pfmemalloc = nc->page.pfmemalloc;
+ }
+
if (unlikely(!data))
return NULL;
- skb = __build_skb(data, len);
+ skb = __napi_build_skb(data, len);
if (unlikely(!skb)) {
skb_free_frag(data);
return NULL;
}
- if (nc->page.pfmemalloc)
+ if (pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -584,10 +752,13 @@ static void skb_free_head(struct sk_buff *skb)
{
unsigned char *head = skb->head;
- if (skb->head_frag)
+ if (skb->head_frag) {
+ if (skb_pp_recycle(skb, head))
+ return;
skb_free_frag(head);
- else
+ } else {
kfree(head);
+ }
}
static void skb_release_data(struct sk_buff *skb)
@@ -598,16 +769,35 @@ static void skb_release_data(struct sk_buff *skb)
if (skb->cloned &&
atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
&shinfo->dataref))
- return;
+ goto exit;
+
+ if (skb_zcopy(skb)) {
+ bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;
+
+ skb_zcopy_clear(skb, true);
+ if (skip_unref)
+ goto free_head;
+ }
for (i = 0; i < shinfo->nr_frags; i++)
- __skb_frag_unref(&shinfo->frags[i]);
+ __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+free_head:
if (shinfo->frag_list)
kfree_skb_list(shinfo->frag_list);
- skb_zcopy_clear(skb, true);
skb_free_head(skb);
+exit:
+ /* When we clone an SKB we copy the reycling bit. The pp_recycle
+ * bit is only set on the head though, so in order to avoid races
+ * while trying to recycle fragments on __skb_frag_unref() we need
+ * to make one SKB responsible for triggering the recycle path.
+ * So disable the recycling bit if an SKB is cloned and we have
+ * additional references to the fragmented part of the SKB.
+ * Eventually the last SKB will have the recycling bit set and it's
+ * dataref set to 0, which will trigger the recycling
+ */
+ skb->pp_recycle = 0;
}
/*
@@ -647,7 +837,7 @@ void skb_release_head_state(struct sk_buff *skb)
{
skb_dst_drop(skb);
if (skb->destructor) {
- WARN_ON(in_irq());
+ DEBUG_NET_WARN_ON_ONCE(in_hardirq());
skb->destructor(skb);
}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -681,42 +871,47 @@ void __kfree_skb(struct sk_buff *skb)
EXPORT_SYMBOL(__kfree_skb);
/**
- * kfree_skb - free an sk_buff
+ * kfree_skb_reason - free an sk_buff with special reason
* @skb: buffer to free
+ * @reason: reason why this skb is dropped
*
* Drop a reference to the buffer and free it if the usage count has
- * hit zero.
+ * hit zero. Meanwhile, pass the drop reason to 'kfree_skb'
+ * tracepoint.
*/
-void kfree_skb(struct sk_buff *skb)
+void __fix_address
+kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
- if (!skb_unref(skb))
+ if (unlikely(!skb_unref(skb)))
return;
- trace_kfree_skb(skb, __builtin_return_address(0));
+ DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);
+
+ trace_kfree_skb(skb, __builtin_return_address(0), reason);
__kfree_skb(skb);
}
-EXPORT_SYMBOL(kfree_skb);
+EXPORT_SYMBOL(kfree_skb_reason);
-void kfree_skb_list(struct sk_buff *segs)
+void kfree_skb_list_reason(struct sk_buff *segs,
+ enum skb_drop_reason reason)
{
while (segs) {
struct sk_buff *next = segs->next;
- kfree_skb(segs);
+ kfree_skb_reason(segs, reason);
segs = next;
}
}
-EXPORT_SYMBOL(kfree_skb_list);
+EXPORT_SYMBOL(kfree_skb_list_reason);
/* Dump skb information and contents.
*
* Must only be called from net_ratelimit()-ed paths.
*
- * Dumps up to can_dump_full whole packets if full_pkt, headers otherwise.
+ * Dumps whole packets if full_pkt, only headers otherwise.
*/
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
{
- static atomic_t can_dump_full = ATOMIC_INIT(5);
struct skb_shared_info *sh = skb_shinfo(skb);
struct net_device *dev = skb->dev;
struct sock *sk = skb->sk;
@@ -726,9 +921,6 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
int i, len, seg_len;
if (full_pkt)
- full_pkt = atomic_dec_if_positive(&can_dump_full) >= 0;
-
- if (full_pkt)
len = skb->len;
else
len = min_t(int, skb->len, MAX_HEADER + 128);
@@ -758,7 +950,7 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
if (dev)
- printk("%sdev name=%s feat=0x%pNF\n",
+ printk("%sdev name=%s feat=%pNF\n",
level, dev->name, &dev->features);
if (sk)
printk("%ssk family=%hu type=%u proto=%u\n",
@@ -816,10 +1008,14 @@ EXPORT_SYMBOL(skb_dump);
*/
void skb_tx_error(struct sk_buff *skb)
{
- skb_zcopy_clear(skb, true);
+ if (skb) {
+ skb_zcopy_downgrade_managed(skb);
+ skb_zcopy_clear(skb, true);
+ }
}
EXPORT_SYMBOL(skb_tx_error);
+#ifdef CONFIG_TRACEPOINTS
/**
* consume_skb - free an skbuff
* @skb: buffer to free
@@ -837,9 +1033,10 @@ void consume_skb(struct sk_buff *skb)
__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
+#endif
/**
- * consume_stateless_skb - free an skbuff, assuming it is stateless
+ * __consume_stateless_skb - free an skbuff, assuming it is stateless
* @skb: buffer to free
*
* Alike consume_skb(), but this variant assumes that this is the last
@@ -852,56 +1049,53 @@ void __consume_stateless_skb(struct sk_buff *skb)
kfree_skbmem(skb);
}
-void __kfree_skb_flush(void)
+static void napi_skb_cache_put(struct sk_buff *skb)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ u32 i;
- /* flush skb_cache if containing objects */
- if (nc->skb_count) {
- kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
- nc->skb_cache);
- nc->skb_count = 0;
- }
-}
-
-static inline void _kfree_skb_defer(struct sk_buff *skb)
-{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
-
- /* drop skb->head and call any destructors for packet */
- skb_release_all(skb);
-
- /* record skb to CPU local list */
+ kasan_poison_object_data(skbuff_head_cache, skb);
nc->skb_cache[nc->skb_count++] = skb;
-#ifdef CONFIG_SLUB
- /* SLUB writes into objects when freeing */
- prefetchw(skb);
-#endif
-
- /* flush skb_cache if it is filled */
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
- kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
- nc->skb_cache);
- nc->skb_count = 0;
+ for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
+ kasan_unpoison_object_data(skbuff_head_cache,
+ nc->skb_cache[i]);
+
+ kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF,
+ nc->skb_cache + NAPI_SKB_CACHE_HALF);
+ nc->skb_count = NAPI_SKB_CACHE_HALF;
}
}
+
void __kfree_skb_defer(struct sk_buff *skb)
{
- _kfree_skb_defer(skb);
+ skb_release_all(skb);
+ napi_skb_cache_put(skb);
}
-void napi_consume_skb(struct sk_buff *skb, int budget)
+void napi_skb_free_stolen_head(struct sk_buff *skb)
{
- if (unlikely(!skb))
- return;
+ if (unlikely(skb->slow_gro)) {
+ nf_reset_ct(skb);
+ skb_dst_drop(skb);
+ skb_ext_put(skb);
+ skb_orphan(skb);
+ skb->slow_gro = 0;
+ }
+ napi_skb_cache_put(skb);
+}
+void napi_consume_skb(struct sk_buff *skb, int budget)
+{
/* Zero budget indicate non-NAPI context called us, like netpoll */
if (unlikely(!budget)) {
dev_consume_skb_any(skb);
return;
}
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
+
if (!skb_unref(skb))
return;
@@ -914,16 +1108,15 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- _kfree_skb_defer(skb);
+ skb_release_all(skb);
+ napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
-/* Make sure a field is enclosed inside headers_start/headers_end section */
+/* Make sure a field is contained by headers group */
#define CHECK_SKB_FIELD(field) \
- BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
- offsetof(struct sk_buff, headers_start)); \
- BUILD_BUG_ON(offsetof(struct sk_buff, field) > \
- offsetof(struct sk_buff, headers_end)); \
+ BUILD_BUG_ON(offsetof(struct sk_buff, field) != \
+ offsetof(struct sk_buff, headers.field)); \
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
@@ -935,14 +1128,12 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
__skb_ext_copy(new, old);
__nf_copy(new, old, false);
- /* Note : this field could be in headers_start/headers_end section
+ /* Note : this field could be in the headers group.
* It is not yet because we do not want to have a 16 bit hole
*/
new->queue_mapping = old->queue_mapping;
- memcpy(&new->headers_start, &old->headers_start,
- offsetof(struct sk_buff, headers_end) -
- offsetof(struct sk_buff, headers_start));
+ memcpy(&new->headers, &old->headers, sizeof(new->headers));
CHECK_SKB_FIELD(protocol);
CHECK_SKB_FIELD(csum);
CHECK_SKB_FIELD(hash);
@@ -964,6 +1155,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#ifdef CONFIG_NET_RX_BUSY_POLL
CHECK_SKB_FIELD(napi_id);
#endif
+ CHECK_SKB_FIELD(alloc_cpu);
#ifdef CONFIG_XPS
CHECK_SKB_FIELD(sender_cpu);
#endif
@@ -993,6 +1185,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
n->nohdr = 0;
n->peeked = 0;
C(pfmemalloc);
+ C(pp_recycle);
n->destructor = NULL;
C(tail);
C(end);
@@ -1091,9 +1284,9 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp)
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
-struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
+static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
{
- struct ubuf_info *uarg;
+ struct ubuf_info_msgzc *uarg;
struct sk_buff *skb;
WARN_ON_ONCE(!in_task());
@@ -1111,30 +1304,35 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
return NULL;
}
- uarg->callback = sock_zerocopy_callback;
+ uarg->ubuf.callback = msg_zerocopy_callback;
uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
- refcount_set(&uarg->refcnt, 1);
+ uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+ refcount_set(&uarg->ubuf.refcnt, 1);
sock_hold(sk);
- return uarg;
+ return &uarg->ubuf;
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);
-static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
+static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
{
return container_of((void *)uarg, struct sk_buff, cb);
}
-struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
- struct ubuf_info *uarg)
+struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
+ struct ubuf_info *uarg)
{
if (uarg) {
+ struct ubuf_info_msgzc *uarg_zc;
const u32 byte_limit = 1 << 19; /* limit to a few TSO */
u32 bytelen, next;
+ /* there might be non MSG_ZEROCOPY users */
+ if (uarg->callback != msg_zerocopy_callback)
+ return NULL;
+
/* realloc only when socket is locked (TCP, UDP cork),
* so uarg->len and sk_zckey access is serialized
*/
@@ -1143,8 +1341,9 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
return NULL;
}
- bytelen = uarg->bytelen + size;
- if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
+ uarg_zc = uarg_to_msgzc(uarg);
+ bytelen = uarg_zc->bytelen + size;
+ if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) {
/* TCP can create new skb to attach new uarg */
if (sk->sk_type == SOCK_STREAM)
goto new_alloc;
@@ -1152,25 +1351,25 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
}
next = (u32)atomic_read(&sk->sk_zckey);
- if ((u32)(uarg->id + uarg->len) == next) {
- if (mm_account_pinned_pages(&uarg->mmp, size))
+ if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
+ if (mm_account_pinned_pages(&uarg_zc->mmp, size))
return NULL;
- uarg->len++;
- uarg->bytelen = bytelen;
+ uarg_zc->len++;
+ uarg_zc->bytelen = bytelen;
atomic_set(&sk->sk_zckey, ++next);
/* no extra ref when appending to datagram (MSG_MORE) */
if (sk->sk_type == SOCK_STREAM)
- sock_zerocopy_get(uarg);
+ net_zcopy_get(uarg);
return uarg;
}
}
new_alloc:
- return sock_zerocopy_alloc(sk, size);
+ return msg_zerocopy_alloc(sk, size);
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
+EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
@@ -1192,13 +1391,14 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
return true;
}
-void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
+static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
{
struct sk_buff *tail, *skb = skb_from_uarg(uarg);
struct sock_exterr_skb *serr;
struct sock *sk = skb->sk;
struct sk_buff_head *q;
unsigned long flags;
+ bool is_zerocopy;
u32 lo, hi;
u16 len;
@@ -1213,6 +1413,7 @@ void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
len = uarg->len;
lo = uarg->id;
hi = uarg->id + len - 1;
+ is_zerocopy = uarg->zerocopy;
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
@@ -1220,7 +1421,7 @@ void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
serr->ee.ee_data = hi;
serr->ee.ee_info = lo;
- if (!success)
+ if (!is_zerocopy)
serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
q = &sk->sk_error_queue;
@@ -1233,51 +1434,42 @@ void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
}
spin_unlock_irqrestore(&q->lock, flags);
- sk->sk_error_report(sk);
+ sk_error_report(sk);
release:
consume_skb(skb);
sock_put(sk);
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
-
-void sock_zerocopy_put(struct ubuf_info *uarg)
-{
- if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
- if (uarg->callback)
- uarg->callback(uarg, uarg->zerocopy);
- else
- consume_skb(skb_from_uarg(uarg));
- }
-}
-EXPORT_SYMBOL_GPL(sock_zerocopy_put);
-void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
+void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
+ bool success)
{
- if (uarg) {
- struct sock *sk = skb_from_uarg(uarg)->sk;
+ struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
- atomic_dec(&sk->sk_zckey);
- uarg->len--;
+ uarg_zc->zerocopy = uarg_zc->zerocopy & success;
- if (have_uref)
- sock_zerocopy_put(uarg);
- }
+ if (refcount_dec_and_test(&uarg->refcnt))
+ __msg_zerocopy_callback(uarg_zc);
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
+EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
-int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
+void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
- return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
+ struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk;
+
+ atomic_dec(&sk->sk_zckey);
+ uarg_to_msgzc(uarg)->len--;
+
+ if (have_uref)
+ msg_zerocopy_callback(NULL, uarg, true);
}
-EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
+EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
struct ubuf_info *uarg)
{
struct ubuf_info *orig_uarg = skb_zcopy(skb);
- struct iov_iter orig_iter = msg->msg_iter;
int err, orig_len = skb->len;
/* An skb can only point to one uarg. This edge case happens when
@@ -1286,12 +1478,12 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
if (orig_uarg && uarg != orig_uarg)
return -EEXIST;
- err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
+ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
struct sock *save_sk = skb->sk;
/* Streams do not free skb on error. Reset to prev state. */
- msg->msg_iter = orig_iter;
+ iov_iter_revert(&msg->msg_iter, skb->len - orig_len);
skb->sk = sk;
___pskb_trim(skb, orig_len);
skb->sk = save_sk;
@@ -1303,6 +1495,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
+void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
+{
+ int i;
+
+ skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_frag_ref(skb, i);
+}
+EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);
+
static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
gfp_t gfp_mask)
{
@@ -1328,7 +1530,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
* @skb: the skb to modify
* @gfp_mask: allocation priority
*
- * This must be called on SKBTX_DEV_ZEROCOPY skb.
+ * This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
* It will copy all frags into kernel and drop the reference
* to userspace pages.
*
@@ -1440,6 +1642,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
refcount_read(&fclones->fclone_ref) == 1) {
n = &fclones->skb2;
refcount_set(&fclones->fclone_ref, 2);
+ n->fclone = SKB_FCLONE_CLONE;
} else {
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
@@ -1620,6 +1823,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
BUG_ON(skb_shared(skb));
+ skb_zcopy_downgrade_managed(skb);
+
size = SKB_DATA_ALIGN(size);
if (skb_pfmemalloc(skb))
@@ -1664,11 +1869,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb->head = data;
skb->head_frag = 0;
skb->data += off;
+
+ skb_set_end_offset(skb, size);
#ifdef NET_SKBUFF_DATA_USES_OFFSET
- skb->end = size;
off = nhead;
-#else
- skb->end = skb->head + size;
#endif
skb->tail += off;
skb_headers_offset_update(skb, nhead);
@@ -1716,6 +1920,89 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
}
EXPORT_SYMBOL(skb_realloc_headroom);
+int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
+{
+ unsigned int saved_end_offset, saved_truesize;
+ struct skb_shared_info *shinfo;
+ int res;
+
+ saved_end_offset = skb_end_offset(skb);
+ saved_truesize = skb->truesize;
+
+ res = pskb_expand_head(skb, 0, 0, pri);
+ if (res)
+ return res;
+
+ skb->truesize = saved_truesize;
+
+ if (likely(skb_end_offset(skb) == saved_end_offset))
+ return 0;
+
+ shinfo = skb_shinfo(skb);
+
+ /* We are about to change back skb->end,
+ * we need to move skb_shinfo() to its new location.
+ */
+ memmove(skb->head + saved_end_offset,
+ shinfo,
+ offsetof(struct skb_shared_info, frags[shinfo->nr_frags]));
+
+ skb_set_end_offset(skb, saved_end_offset);
+
+ return 0;
+}
+
+/**
+ * skb_expand_head - reallocate header of &sk_buff
+ * @skb: buffer to reallocate
+ * @headroom: needed headroom
+ *
+ * Unlike skb_realloc_headroom, this one does not allocate a new skb
+ * if possible; copies skb->sk to new skb as needed
+ * and frees original skb in case of failures.
+ *
+ * It expect increased headroom and generates warning otherwise.
+ */
+
+struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
+{
+ int delta = headroom - skb_headroom(skb);
+ int osize = skb_end_offset(skb);
+ struct sock *sk = skb->sk;
+
+ if (WARN_ONCE(delta <= 0,
+ "%s is expecting an increase in the headroom", __func__))
+ return skb;
+
+ delta = SKB_DATA_ALIGN(delta);
+ /* pskb_expand_head() might crash, if skb is shared. */
+ if (skb_shared(skb) || !is_skb_wmem(skb)) {
+ struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+ if (unlikely(!nskb))
+ goto fail;
+
+ if (sk)
+ skb_set_owner_w(nskb, sk);
+ consume_skb(skb);
+ skb = nskb;
+ }
+ if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC))
+ goto fail;
+
+ if (sk && is_skb_wmem(skb)) {
+ delta = skb_end_offset(skb) - osize;
+ refcount_add(delta, &sk->sk_wmem_alloc);
+ skb->truesize += delta;
+ }
+ return skb;
+
+fail:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(skb_expand_head);
+
/**
* skb_copy_expand - copy and expand sk_buff
* @skb: buffer to copy
@@ -1903,6 +2190,30 @@ void *skb_pull(struct sk_buff *skb, unsigned int len)
EXPORT_SYMBOL(skb_pull);
/**
+ * skb_pull_data - remove data from the start of a buffer returning its
+ * original position.
+ * @skb: buffer to use
+ * @len: amount of data to remove
+ *
+ * This function removes data from the start of a buffer, returning
+ * the memory to the headroom. A pointer to the original data in the buffer
+ * is returned after checking if there is enough data to pull. Once the
+ * data has been pulled future pushes will overwrite the old data.
+ */
+void *skb_pull_data(struct sk_buff *skb, size_t len)
+{
+ void *data = skb->data;
+
+ if (skb->len < len)
+ return NULL;
+
+ skb_pull(skb, len);
+
+ return data;
+}
+EXPORT_SYMBOL(skb_pull_data);
+
+/**
* skb_trim - remove end from a buffer
* @skb: buffer to alter
* @len: new length
@@ -2016,6 +2327,12 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
skb->csum = csum_block_sub(skb->csum,
skb_checksum(skb, len, delta, 0),
len);
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
+ int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
+
+ if (offset + sizeof(__sum16) > hdlen)
+ return -EINVAL;
}
return __pskb_trim(skb, len);
}
@@ -2123,7 +2440,7 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
/* Free pulled out fragments. */
while ((list = skb_shinfo(skb)->frag_list) != insp) {
skb_shinfo(skb)->frag_list = list->next;
- kfree_skb(list);
+ consume_skb(list);
}
/* And insert new clone at head. */
if (clone) {
@@ -2445,9 +2762,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
}
EXPORT_SYMBOL_GPL(skb_splice_bits);
-/* Send skb data on a socket. Socket must be locked. */
-int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
- int len)
+static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg,
+ struct kvec *vec, size_t num, size_t size)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (!sock)
+ return -EINVAL;
+ return kernel_sendmsg(sock, msg, vec, num, size);
+}
+
+static int sendpage_unlocked(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (!sock)
+ return -EINVAL;
+ return kernel_sendpage(sock, page, offset, size, flags);
+}
+
+typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg,
+ struct kvec *vec, size_t num, size_t size);
+typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags);
+static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
+ int len, sendmsg_func sendmsg, sendpage_func sendpage)
{
unsigned int orig_len = len;
struct sk_buff *head = skb;
@@ -2467,7 +2807,8 @@ do_frag_list:
memset(&msg, 0, sizeof(msg));
msg.msg_flags = MSG_DONTWAIT;
- ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
+ ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked,
+ sendmsg_unlocked, sk, &msg, &kv, 1, slen);
if (ret <= 0)
goto error;
@@ -2498,9 +2839,11 @@ do_frag_list:
slen = min_t(size_t, len, skb_frag_size(frag) - offset);
while (slen) {
- ret = kernel_sendpage_locked(sk, skb_frag_page(frag),
- skb_frag_off(frag) + offset,
- slen, MSG_DONTWAIT);
+ ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked,
+ sendpage_unlocked, sk,
+ skb_frag_page(frag),
+ skb_frag_off(frag) + offset,
+ slen, MSG_DONTWAIT);
if (ret <= 0)
goto error;
@@ -2532,8 +2875,23 @@ out:
error:
return orig_len == len ? ret : orig_len - len;
}
+
+/* Send skb data on a socket. Socket must be locked. */
+int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
+ int len)
+{
+ return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked,
+ kernel_sendpage_locked);
+}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);
+/* Send skb data on a socket. Socket must be unlocked. */
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
+{
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked,
+ sendpage_unlocked);
+}
+
/**
* skb_store_bits - store bits from kernel buffer to skb
* @skb: destination buffer
@@ -2723,19 +3081,20 @@ EXPORT_SYMBOL(skb_checksum);
/* Both of above in one bottle. */
__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
- u8 *to, int len, __wsum csum)
+ u8 *to, int len)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
int pos = 0;
+ __wsum csum = 0;
/* Copy header. */
if (copy > 0) {
if (copy > len)
copy = len;
csum = csum_partial_copy_nocheck(skb->data + offset, to,
- copy, csum);
+ copy);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -2765,7 +3124,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
vaddr = kmap_atomic(p);
csum2 = csum_partial_copy_nocheck(vaddr + p_off,
to + copied,
- p_len, 0);
+ p_len);
kunmap_atomic(vaddr);
csum = csum_block_add(csum, csum2, pos);
pos += p_len;
@@ -2791,7 +3150,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
copy = len;
csum2 = skb_copy_and_csum_bits(frag_iter,
offset - start,
- to, copy, 0);
+ to, copy);
csum = csum_block_add(csum, csum2, pos);
if ((len -= copy) == 0)
return csum;
@@ -2905,8 +3264,11 @@ skb_zerocopy_headlen(const struct sk_buff *from)
if (!from->head_frag ||
skb_headlen(from) < L1_CACHE_BYTES ||
- skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) {
hlen = skb_headlen(from);
+ if (!hlen)
+ hlen = from->len;
+ }
if (skb_has_frag_list(from))
hlen = from->len;
@@ -2965,9 +3327,7 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
}
}
- to->truesize += len + plen;
- to->len += len + plen;
- to->data_len += len + plen;
+ skb_len_add(to, len + plen);
if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
skb_tx_error(from);
@@ -3011,7 +3371,7 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
csum = 0;
if (csstart != skb->len)
csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
- skb->len - csstart, 0);
+ skb->len - csstart);
if (skb->ip_summed == CHECKSUM_PARTIAL) {
long csstuff = csstart + skb->csum_offset;
@@ -3257,9 +3617,11 @@ static inline void skb_split_no_header(struct sk_buff *skb,
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
int pos = skb_headlen(skb);
+ const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
+
+ skb_zcopy_downgrade_managed(skb);
- skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
- SKBTX_SHARED_FRAG;
+ skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
skb_zerocopy_clone(skb1, skb, 0);
if (len < pos) /* Split line is inside header. */
skb_split_inside_header(skb, skb1, len, pos);
@@ -3274,7 +3636,7 @@ EXPORT_SYMBOL(skb_split);
*/
static int skb_prepare_for_shift(struct sk_buff *skb)
{
- return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ return skb_unclone_keeptruesize(skb, GFP_ATOMIC);
}
/**
@@ -3386,7 +3748,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
fragto = &skb_shinfo(tgt)->frags[merge];
skb_frag_size_add(fragto, skb_frag_size(fragfrom));
- __skb_frag_unref(fragfrom);
+ __skb_frag_unref(fragfrom, skb->pp_recycle);
}
/* Reposition in the original skb */
@@ -3404,13 +3766,8 @@ onlymerged:
tgt->ip_summed = CHECKSUM_PARTIAL;
skb->ip_summed = CHECKSUM_PARTIAL;
- /* Yak, is it really working this way? Some helper please? */
- skb->len -= shiftlen;
- skb->data_len -= shiftlen;
- skb->truesize -= shiftlen;
- tgt->len += shiftlen;
- tgt->data_len += shiftlen;
- tgt->truesize += shiftlen;
+ skb_len_add(skb, -shiftlen);
+ skb_len_add(tgt, shiftlen);
return shiftlen;
}
@@ -3433,6 +3790,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
st->root_skb = st->cur_skb = skb;
st->frag_idx = st->stepped_offset = 0;
st->frag_data = NULL;
+ st->frag_off = 0;
}
EXPORT_SYMBOL(skb_prepare_seq_read);
@@ -3487,14 +3845,27 @@ next_skb:
st->stepped_offset += skb_headlen(st->cur_skb);
while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
+ unsigned int pg_idx, pg_off, pg_sz;
+
frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
- block_limit = skb_frag_size(frag) + st->stepped_offset;
+ pg_idx = 0;
+ pg_off = skb_frag_off(frag);
+ pg_sz = skb_frag_size(frag);
+
+ if (skb_frag_must_loop(skb_frag_page(frag))) {
+ pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT;
+ pg_off = offset_in_page(pg_off + st->frag_off);
+ pg_sz = min_t(unsigned int, pg_sz - st->frag_off,
+ PAGE_SIZE - pg_off);
+ }
+
+ block_limit = pg_sz + st->stepped_offset;
if (abs_offset < block_limit) {
if (!st->frag_data)
- st->frag_data = kmap_atomic(skb_frag_page(frag));
+ st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx);
- *data = (u8 *) st->frag_data + skb_frag_off(frag) +
+ *data = (u8 *)st->frag_data + pg_off +
(abs_offset - st->stepped_offset);
return block_limit - abs_offset;
@@ -3505,8 +3876,12 @@ next_skb:
st->frag_data = NULL;
}
- st->frag_idx++;
- st->stepped_offset += skb_frag_size(frag);
+ st->stepped_offset += pg_sz;
+ st->frag_off += pg_sz;
+ if (st->frag_off == skb_frag_size(frag)) {
+ st->frag_off = 0;
+ st->frag_idx++;
+ }
}
if (st->frag_data) {
@@ -3574,6 +3949,8 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
struct ts_state state;
unsigned int ret;
+ BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb));
+
config->get_next_block = skb_ts_get_next_block;
config->finish = skb_ts_finish;
@@ -3592,8 +3969,9 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
if (skb_can_coalesce(skb, i, page, offset)) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
} else if (i < MAX_SKB_FRAGS) {
+ skb_zcopy_downgrade_managed(skb);
get_page(page);
- skb_fill_page_desc(skb, i, page, offset, size);
+ skb_fill_page_desc_noacc(skb, i, page, offset, size);
} else {
return -EMSGSIZE;
}
@@ -3646,7 +4024,8 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb,
unsigned int delta_truesize = 0;
unsigned int delta_len = 0;
struct sk_buff *tail = NULL;
- struct sk_buff *nskb;
+ struct sk_buff *nskb, *tmp;
+ int len_diff, err;
skb_push(skb, -skb_network_offset(skb) + offset);
@@ -3656,21 +4035,41 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb,
nskb = list_skb;
list_skb = list_skb->next;
+ err = 0;
+ delta_truesize += nskb->truesize;
+ if (skb_shared(nskb)) {
+ tmp = skb_clone(nskb, GFP_ATOMIC);
+ if (tmp) {
+ consume_skb(nskb);
+ nskb = tmp;
+ err = skb_unclone(nskb, GFP_ATOMIC);
+ } else {
+ err = -ENOMEM;
+ }
+ }
+
if (!tail)
skb->next = nskb;
else
tail->next = nskb;
+ if (unlikely(err)) {
+ nskb->next = list_skb;
+ goto err_linearize;
+ }
+
tail = nskb;
delta_len += nskb->len;
- delta_truesize += nskb->truesize;
skb_push(nskb, -skb_network_offset(nskb) + offset);
- __copy_skb_header(nskb, skb);
+ skb_release_head_state(nskb);
+ len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb);
+ __copy_skb_header(nskb, skb);
skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
+ nskb->transport_header += len_diff;
skb_copy_from_linear_data_offset(skb, -tnl_hlen,
nskb->data - tnl_hlen,
offset + tnl_hlen);
@@ -3704,30 +4103,6 @@ err_linearize:
}
EXPORT_SYMBOL_GPL(skb_segment_list);
-int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
-{
- if (unlikely(p->len + skb->len >= 65536))
- return -E2BIG;
-
- if (NAPI_GRO_CB(p)->last == p)
- skb_shinfo(p)->frag_list = skb;
- else
- NAPI_GRO_CB(p)->last->next = skb;
-
- skb_pull(skb, skb_gro_offset(skb));
-
- NAPI_GRO_CB(p)->last = skb;
- NAPI_GRO_CB(p)->count++;
- p->data_len += skb->len;
- p->truesize += skb->truesize;
- p->len += skb->len;
-
- NAPI_GRO_CB(skb)->same_flow = 1;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(skb_gro_receive_list);
-
/**
* skb_segment - Perform protocol segmentation on skb.
* @head_skb: buffer to segment
@@ -3758,29 +4133,30 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
int err = -ENOMEM;
int i = 0;
int pos;
- int dummy;
-
- if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) &&
- (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) {
- /* gso_size is untrusted, and we have a frag_list with a linear
- * non head_frag head.
- *
- * (we assume checking the first list_skb member suffices;
- * i.e if either of the list_skb members have non head_frag
- * head, then the first one has too).
- *
- * If head_skb's headlen does not fit requested gso_size, it
- * means that the frag_list members do NOT terminate on exact
- * gso_size boundaries. Hence we cannot perform skb_frag_t page
- * sharing. Therefore we must fallback to copying the frag_list
- * skbs; we do so by disabling SG.
- */
- if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb))
- features &= ~NETIF_F_SG;
+
+ if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
+ mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
+ struct sk_buff *check_skb;
+
+ for (check_skb = list_skb; check_skb; check_skb = check_skb->next) {
+ if (skb_headlen(check_skb) && !check_skb->head_frag) {
+ /* gso_size is untrusted, and we have a frag_list with
+ * a linear non head_frag item.
+ *
+ * If head_skb's headlen does not fit requested gso_size,
+ * it means that the frag_list members do NOT terminate
+ * on exact gso_size boundaries. Hence we cannot perform
+ * skb_frag_t page sharing. Therefore we must fallback to
+ * copying the frag_list skbs; we do so by disabling SG.
+ */
+ features &= ~NETIF_F_SG;
+ break;
+ }
+ }
}
__skb_push(head_skb, doffset);
- proto = skb_network_protocol(head_skb, &dummy);
+ proto = skb_network_protocol(head_skb, NULL);
if (unlikely(!proto))
return ERR_PTR(-EINVAL);
@@ -3848,12 +4224,8 @@ normal:
}
hsize = skb_headlen(head_skb) - offset;
- if (hsize < 0)
- hsize = 0;
- if (hsize > len || !sg)
- hsize = len;
- if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
+ if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
(skb_headlen(list_skb) == len || sg)) {
BUG_ON(skb_headlen(list_skb) > len);
@@ -3896,6 +4268,11 @@ normal:
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
+ if (hsize < 0)
+ hsize = 0;
+ if (hsize > len || !sg)
+ hsize = len;
+
nskb = __alloc_skb(hsize + doffset + headroom,
GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
NUMA_NO_NODE);
@@ -3926,14 +4303,20 @@ normal:
goto perform_csum_check;
if (!sg) {
- if (!nskb->remcsum_offload)
- nskb->ip_summed = CHECKSUM_NONE;
- SKB_GSO_CB(nskb)->csum =
- skb_copy_and_csum_bits(head_skb, offset,
- skb_put(nskb, len),
- len, 0);
- SKB_GSO_CB(nskb)->csum_start =
- skb_headroom(nskb) + doffset;
+ if (!csum) {
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_copy_and_csum_bits(head_skb, offset,
+ skb_put(nskb,
+ len),
+ len);
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
+ } else {
+ if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len))
+ goto err;
+ }
continue;
}
@@ -3942,8 +4325,8 @@ normal:
skb_copy_from_linear_data_offset(head_skb, offset,
skb_put(nskb, hsize), hsize);
- skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
- SKBTX_SHARED_FRAG;
+ skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
+ SKBFL_SHARED_FRAG;
if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
@@ -4073,118 +4456,6 @@ err:
}
EXPORT_SYMBOL_GPL(skb_segment);
-int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
-{
- struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
- unsigned int offset = skb_gro_offset(skb);
- unsigned int headlen = skb_headlen(skb);
- unsigned int len = skb_gro_len(skb);
- unsigned int delta_truesize;
- struct sk_buff *lp;
-
- if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
- return -E2BIG;
-
- lp = NAPI_GRO_CB(p)->last;
- pinfo = skb_shinfo(lp);
-
- if (headlen <= offset) {
- skb_frag_t *frag;
- skb_frag_t *frag2;
- int i = skbinfo->nr_frags;
- int nr_frags = pinfo->nr_frags + i;
-
- if (nr_frags > MAX_SKB_FRAGS)
- goto merge;
-
- offset -= headlen;
- pinfo->nr_frags = nr_frags;
- skbinfo->nr_frags = 0;
-
- frag = pinfo->frags + nr_frags;
- frag2 = skbinfo->frags + i;
- do {
- *--frag = *--frag2;
- } while (--i);
-
- skb_frag_off_add(frag, offset);
- skb_frag_size_sub(frag, offset);
-
- /* all fragments truesize : remove (head size + sk_buff) */
- delta_truesize = skb->truesize -
- SKB_TRUESIZE(skb_end_offset(skb));
-
- skb->truesize -= skb->data_len;
- skb->len -= skb->data_len;
- skb->data_len = 0;
-
- NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
- goto done;
- } else if (skb->head_frag) {
- int nr_frags = pinfo->nr_frags;
- skb_frag_t *frag = pinfo->frags + nr_frags;
- struct page *page = virt_to_head_page(skb->head);
- unsigned int first_size = headlen - offset;
- unsigned int first_offset;
-
- if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
- goto merge;
-
- first_offset = skb->data -
- (unsigned char *)page_address(page) +
- offset;
-
- pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
-
- __skb_frag_set_page(frag, page);
- skb_frag_off_set(frag, first_offset);
- skb_frag_size_set(frag, first_size);
-
- memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
- /* We dont need to clear skbinfo->nr_frags here */
-
- delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
- NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
- goto done;
- }
-
-merge:
- delta_truesize = skb->truesize;
- if (offset > headlen) {
- unsigned int eat = offset - headlen;
-
- skb_frag_off_add(&skbinfo->frags[0], eat);
- skb_frag_size_sub(&skbinfo->frags[0], eat);
- skb->data_len -= eat;
- skb->len -= eat;
- offset = headlen;
- }
-
- __skb_pull(skb, offset);
-
- if (NAPI_GRO_CB(p)->last == p)
- skb_shinfo(p)->frag_list = skb;
- else
- NAPI_GRO_CB(p)->last->next = skb;
- NAPI_GRO_CB(p)->last = skb;
- __skb_header_release(skb);
- lp = p;
-
-done:
- NAPI_GRO_CB(p)->count++;
- p->data_len += len;
- p->truesize += delta_truesize;
- p->len += len;
- if (lp != p) {
- lp->data_len += len;
- lp->truesize += delta_truesize;
- lp->len += len;
- }
- NAPI_GRO_CB(skb)->same_flow = 1;
- return 0;
-}
-EXPORT_SYMBOL_GPL(skb_gro_receive);
-
#ifdef CONFIG_SKB_EXTENSIONS
#define SKB_EXT_ALIGN_VALUE 8
#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)
@@ -4202,6 +4473,9 @@ static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_MPTCP)
[SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
#endif
+#if IS_ENABLED(CONFIG_MCTP_FLOWS)
+ [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
+#endif
};
static __always_inline unsigned int skb_ext_total_length(void)
@@ -4219,6 +4493,9 @@ static __always_inline unsigned int skb_ext_total_length(void)
#if IS_ENABLED(CONFIG_MPTCP)
skb_ext_type_len[SKB_EXT_MPTCP] +
#endif
+#if IS_ENABLED(CONFIG_MCTP_FLOWS)
+ skb_ext_type_len[SKB_EXT_MCTP] +
+#endif
0;
}
@@ -4407,7 +4684,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
* at the moment even if they are anonymous).
*/
if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
- __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
+ !__pskb_pull_tail(skb, __skb_pagelen(skb)))
return -ENOMEM;
/* Easy case. Most of packets will go this way. */
@@ -4525,7 +4802,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
skb_queue_tail(&sk->sk_error_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_error_report(sk);
+ sk_error_report(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);
@@ -4548,7 +4825,7 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
if (skb && (skb_next = skb_peek(q))) {
icmp_next = is_icmp_err_skb(skb_next);
if (icmp_next)
- sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin;
+ sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
}
spin_unlock_irqrestore(&q->lock, flags);
@@ -4556,7 +4833,7 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
sk->sk_err = 0;
if (skb_next)
- sk->sk_error_report(sk);
+ sk_error_report(sk);
return skb;
}
@@ -4615,9 +4892,8 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
serr->ee.ee_data = skb_shinfo(skb)->tskey;
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM)
- serr->ee.ee_data -= sk->sk_tskey;
+ if (sk_is_tcp(sk))
+ serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
}
err = sock_queue_err_skb(sk, skb);
@@ -4630,7 +4906,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
bool ret;
- if (likely(sysctl_tstamp_allow_data || tsonly))
+ if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly))
return true;
read_lock_bh(&sk->sk_callback_lock);
@@ -4664,6 +4940,7 @@ err:
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
struct sock *sk, int tstype)
{
@@ -4684,9 +4961,9 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (tsonly) {
#ifdef CONFIG_INET
if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
- sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM) {
- skb = tcp_get_timestamping_opt_stats(sk);
+ sk_is_tcp(sk)) {
+ skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
+ ack_skb);
opt_stats = true;
} else
#endif
@@ -4706,7 +4983,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (hwtstamps)
*skb_hwtstamps(skb) = *hwtstamps;
else
- skb->tstamp = ktime_get_real();
+ __net_timestamp(skb);
__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
@@ -4715,7 +4992,7 @@ EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
void skb_tstamp_tx(struct sk_buff *orig_skb,
struct skb_shared_hwtstamps *hwtstamps)
{
- return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk,
+ return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
SCM_TSTAMP_SND);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
@@ -4848,7 +5125,7 @@ static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
if (err < 0)
goto out;
- if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF))
+ if (ip_is_fragment(ip_hdr(skb)))
fragment = true;
off = ip_hdrlen(skb);
@@ -5130,6 +5407,20 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
if (skb_cloned(to))
return false;
+ /* In general, avoid mixing slab allocated and page_pool allocated
+ * pages within the same SKB. However when @to is not pp_recycle and
+ * @from is cloned, we can transition frag pages from page_pool to
+ * reference counted.
+ *
+ * On the other hand, don't allow coalescing two pp_recycle SKBs if
+ * @from is cloned, in case the SKB is using page_pool fragment
+ * references (PP_FLAG_PAGE_FRAG). Since we only take full page
+ * references for cloned SKBs at the moment that would result in
+ * inconsistent reference counts.
+ */
+ if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from)))
+ return false;
+
if (len <= skb_tailroom(to)) {
if (len)
BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
@@ -5229,7 +5520,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
ipvs_reset(skb);
skb->mark = 0;
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);
@@ -5413,8 +5704,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
skb = skb_share_check(skb, GFP_ATOMIC);
if (unlikely(!skb))
goto err_free;
-
- if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
+ /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
+ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
goto err_free;
vhdr = (struct vlan_hdr *)skb->data;
@@ -5429,7 +5720,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
goto err_free;
skb_reset_network_header(skb);
- skb_reset_transport_header(skb);
+ if (!skb_transport_header_was_set(skb))
+ skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
return skb;
@@ -5440,7 +5732,7 @@ err_free:
}
EXPORT_SYMBOL(skb_vlan_untag);
-int skb_ensure_writable(struct sk_buff *skb, int write_len)
+int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
{
if (!pskb_may_pull(skb, write_len))
return -ENOMEM;
@@ -5554,6 +5846,73 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
}
EXPORT_SYMBOL(skb_vlan_push);
+/**
+ * skb_eth_pop() - Drop the Ethernet header at the head of a packet
+ *
+ * @skb: Socket buffer to modify
+ *
+ * Drop the Ethernet header of @skb.
+ *
+ * Expects that skb->data points to the mac header and that no VLAN tags are
+ * present.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_eth_pop(struct sk_buff *skb)
+{
+ if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) ||
+ skb_network_offset(skb) < ETH_HLEN)
+ return -EPROTO;
+
+ skb_pull_rcsum(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_eth_pop);
+
+/**
+ * skb_eth_push() - Add a new Ethernet header at the head of a packet
+ *
+ * @skb: Socket buffer to modify
+ * @dst: Destination MAC address of the new header
+ * @src: Source MAC address of the new header
+ *
+ * Prepend @skb with a new Ethernet header.
+ *
+ * Expects that skb->data points to the mac header, which must be empty.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
+ const unsigned char *src)
+{
+ struct ethhdr *eth;
+ int err;
+
+ if (skb_network_offset(skb) || skb_vlan_tag_present(skb))
+ return -EPROTO;
+
+ err = skb_cow_head(skb, sizeof(*eth));
+ if (err < 0)
+ return err;
+
+ skb_push(skb, sizeof(*eth));
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ eth = eth_hdr(skb);
+ ether_addr_copy(eth->h_dest, dst);
+ ether_addr_copy(eth->h_source, src);
+ eth->h_proto = skb->protocol;
+
+ skb_postpush_rcsum(skb, eth, sizeof(*eth));
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_eth_push);
+
/* Update the ethertype of hdr and the skb csum value if required. */
static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
__be16 ethertype)
@@ -5615,7 +5974,7 @@ int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
lse->label_stack_entry = mpls_lse;
skb_postpush_rcsum(skb, lse, MPLS_HLEN);
- if (ethernet)
+ if (ethernet && mac_len >= ETH_HLEN)
skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
skb->protocol = mpls_proto;
@@ -5655,7 +6014,7 @@ int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
skb_reset_mac_header(skb);
skb_set_network_header(skb, mac_len);
- if (ethernet) {
+ if (ethernet && mac_len >= ETH_HLEN) {
struct ethhdr *hdr;
/* use mpls_hdr() to get ethertype to account for VLANs. */
@@ -5718,6 +6077,9 @@ int skb_mpls_dec_ttl(struct sk_buff *skb)
if (unlikely(!eth_p_mpls(skb->protocol)))
return -EINVAL;
+ if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
+ return -ENOMEM;
+
lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
if (!--ttl)
@@ -5852,11 +6214,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb->head = data;
skb->data = data;
skb->head_frag = 0;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- skb->end = size;
-#else
- skb->end = skb->head + size;
-#endif
+ skb_set_end_offset(skb, size);
skb_set_tail_pointer(skb, skb_headlen(skb));
skb_headers_offset_update(skb, 0);
skb->cloned = 0;
@@ -5913,7 +6271,7 @@ static int pskb_carve_frag_list(struct sk_buff *skb,
/* Free pulled out fragments. */
while ((list = shinfo->frag_list) != insp) {
shinfo->frag_list = list->next;
- kfree_skb(list);
+ consume_skb(list);
}
/* And insert new clone at head. */
if (clone) {
@@ -5948,8 +6306,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
size = SKB_WITH_OVERHEAD(ksize(data));
memcpy((struct skb_shared_info *)(data + size),
- skb_shinfo(skb), offsetof(struct skb_shared_info,
- frags[skb_shinfo(skb)->nr_frags]));
+ skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
if (skb_orphan_frags(skb, gfp_mask)) {
kfree(data);
return -ENOMEM;
@@ -5982,20 +6339,20 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- if (k == 0) {
- /* split line is in frag list */
- pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask);
+ /* split line is in frag list */
+ if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
+ /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
+ if (skb_has_frag_list(skb))
+ kfree_skb_list(skb_shinfo(skb)->frag_list);
+ kfree(data);
+ return -ENOMEM;
}
skb_release_data(skb);
skb->head = data;
skb->head_frag = 0;
skb->data = data;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- skb->end = size;
-#else
- skb->end = skb->head + size;
-#endif
+ skb_set_end_offset(skb, size);
skb_reset_tail_pointer(skb);
skb_headers_offset_update(skb, 0);
skb->cloned = 0;
@@ -6079,13 +6436,15 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
/**
* __skb_ext_alloc - allocate a new skb extensions storage
*
+ * @flags: See kmalloc().
+ *
* Returns the newly allocated pointer. The pointer can later attached to a
* skb via __skb_ext_set().
* Note: caller must handle the skb_ext as an opaque data.
*/
-struct skb_ext *__skb_ext_alloc(void)
+struct skb_ext *__skb_ext_alloc(gfp_t flags)
{
- struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
+ struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
if (new) {
memset(new->offset, 0, sizeof(new->offset));
@@ -6180,7 +6539,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
} else {
newoff = SKB_EXT_CHUNKSIZEOF(*new);
- new = __skb_ext_alloc();
+ new = __skb_ext_alloc(GFP_ATOMIC);
if (!new)
return NULL;
}
@@ -6189,6 +6548,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
new->chunks = newlen;
new->offset[id] = newoff;
set_active:
+ skb->slow_gro = 1;
skb->extensions = new;
skb->active_extensions |= 1 << id;
return skb_ext_get_ptr(new, id);
@@ -6205,6 +6565,14 @@ static void skb_ext_put_sp(struct sec_path *sp)
}
#endif
+#ifdef CONFIG_MCTP_FLOWS
+static void skb_ext_put_mctp(struct mctp_flow *flow)
+{
+ if (flow->key)
+ mctp_key_unref(flow->key);
+}
+#endif
+
void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
struct skb_ext *ext = skb->extensions;
@@ -6240,8 +6608,58 @@ free_now:
if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
#endif
+#ifdef CONFIG_MCTP_FLOWS
+ if (__skb_ext_exist(ext, SKB_EXT_MCTP))
+ skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP));
+#endif
kmem_cache_free(skbuff_ext_cache, ext);
}
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */
+
+/**
+ * skb_attempt_defer_free - queue skb for remote freeing
+ * @skb: buffer
+ *
+ * Put @skb in a per-cpu list, using the cpu which
+ * allocated the skb/pages to reduce false sharing
+ * and memory zone spinlock contention.
+ */
+void skb_attempt_defer_free(struct sk_buff *skb)
+{
+ int cpu = skb->alloc_cpu;
+ struct softnet_data *sd;
+ unsigned long flags;
+ unsigned int defer_max;
+ bool kick;
+
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
+ !cpu_online(cpu) ||
+ cpu == raw_smp_processor_id()) {
+nodefer: __kfree_skb(skb);
+ return;
+ }
+
+ sd = &per_cpu(softnet_data, cpu);
+ defer_max = READ_ONCE(sysctl_skb_defer_max);
+ if (READ_ONCE(sd->defer_count) >= defer_max)
+ goto nodefer;
+
+ spin_lock_irqsave(&sd->defer_lock, flags);
+ /* Send an IPI every time queue reaches half capacity. */
+ kick = sd->defer_count == (defer_max >> 1);
+ /* Paired with the READ_ONCE() few lines above */
+ WRITE_ONCE(sd->defer_count, sd->defer_count + 1);
+
+ skb->next = sd->defer_list;
+ /* Paired with READ_ONCE() in skb_defer_free_flush() */
+ WRITE_ONCE(sd->defer_list, skb);
+ spin_unlock_irqrestore(&sd->defer_lock, flags);
+
+ /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
+ * if we are unlucky enough (this seems very unlikely).
+ */
+ if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
+ smp_call_function_single_async(cpu, &sd->defer_csd);
+}
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index ded2d5227678..e6b9ced3eda8 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -7,6 +7,7 @@
#include <net/sock.h>
#include <net/tcp.h>
+#include <net/tls.h>
static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
{
@@ -26,6 +27,7 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
int elem_first_coalesce)
{
struct page_frag *pfrag = sk_page_frag(sk);
+ u32 osize = msg->sg.size;
int ret = 0;
len -= msg->sg.size;
@@ -34,13 +36,17 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
u32 orig_offset;
int use, i;
- if (!sk_page_frag_refill(sk, pfrag))
- return -ENOMEM;
+ if (!sk_page_frag_refill(sk, pfrag)) {
+ ret = -ENOMEM;
+ goto msg_trim;
+ }
orig_offset = pfrag->offset;
use = min_t(int, len, pfrag->size - orig_offset);
- if (!sk_wmem_schedule(sk, use))
- return -ENOMEM;
+ if (!sk_wmem_schedule(sk, use)) {
+ ret = -ENOMEM;
+ goto msg_trim;
+ }
i = msg->sg.end;
sk_msg_iter_var_prev(i);
@@ -70,6 +76,10 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
}
return ret;
+
+msg_trim:
+ sk_msg_trim(sk, msg, osize);
+ return ret;
}
EXPORT_SYMBOL_GPL(sk_msg_alloc);
@@ -169,10 +179,12 @@ static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
struct scatterlist *sge = sk_msg_elem(msg, i);
u32 len = sge->length;
- if (charge)
- sk_mem_uncharge(sk, len);
- if (!msg->skb)
+ /* When the skb owns the memory we free it from consume_skb path. */
+ if (!msg->skb) {
+ if (charge)
+ sk_mem_uncharge(sk, len);
put_page(sg_page(sge));
+ }
memset(sge, 0, sizeof(*sge));
return len;
}
@@ -312,14 +324,13 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
goto out;
}
- copied = iov_iter_get_pages(from, pages, bytes, maxpages,
+ copied = iov_iter_get_pages2(from, pages, bytes, maxpages,
&offset);
if (copied <= 0) {
ret = -EFAULT;
goto out;
}
- iov_iter_advance(from, copied);
bytes -= copied;
msg->sg.size += copied;
@@ -396,29 +407,146 @@ out:
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
-static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+/* Receive sk_msg from psock->ingress_msg to @msg. */
+int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+ int len, int flags)
+{
+ struct iov_iter *iter = &msg->msg_iter;
+ int peek = flags & MSG_PEEK;
+ struct sk_msg *msg_rx;
+ int i, copied = 0;
+
+ msg_rx = sk_psock_peek_msg(psock);
+ while (copied != len) {
+ struct scatterlist *sge;
+
+ if (unlikely(!msg_rx))
+ break;
+
+ i = msg_rx->sg.start;
+ do {
+ struct page *page;
+ int copy;
+
+ sge = sk_msg_elem(msg_rx, i);
+ copy = sge->length;
+ page = sg_page(sge);
+ if (copied + copy > len)
+ copy = len - copied;
+ copy = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (!copy) {
+ copied = copied ? copied : -EFAULT;
+ goto out;
+ }
+
+ copied += copy;
+ if (likely(!peek)) {
+ sge->offset += copy;
+ sge->length -= copy;
+ if (!msg_rx->skb)
+ sk_mem_uncharge(sk, copy);
+ msg_rx->sg.size -= copy;
+
+ if (!sge->length) {
+ sk_msg_iter_var_next(i);
+ if (!msg_rx->skb)
+ put_page(page);
+ }
+ } else {
+ /* Lets not optimize peek case if copy_page_to_iter
+ * didn't copy the entire length lets just break.
+ */
+ if (copy != sge->length)
+ goto out;
+ sk_msg_iter_var_next(i);
+ }
+
+ if (copied == len)
+ break;
+ } while ((i != msg_rx->sg.end) && !sg_is_last(sge));
+
+ if (unlikely(peek)) {
+ msg_rx = sk_psock_next_msg(psock, msg_rx);
+ if (!msg_rx)
+ break;
+ continue;
+ }
+
+ msg_rx->sg.start = i;
+ if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) {
+ msg_rx = sk_psock_dequeue_msg(psock);
+ kfree_sk_msg(msg_rx);
+ }
+ msg_rx = sk_psock_peek_msg(psock);
+ }
+out:
+ if (psock->work_state.skb && copied > 0)
+ schedule_work(&psock->work);
+ return copied;
+}
+EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
+
+bool sk_msg_is_readable(struct sock *sk)
+{
+ struct sk_psock *psock;
+ bool empty = true;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock))
+ empty = list_empty(&psock->ingress_msg);
+ rcu_read_unlock();
+ return !empty;
+}
+EXPORT_SYMBOL_GPL(sk_msg_is_readable);
+
+static struct sk_msg *alloc_sk_msg(gfp_t gfp)
{
- struct sock *sk = psock->sk;
- int copied = 0, num_sge;
struct sk_msg *msg;
- msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+ msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN);
if (unlikely(!msg))
- return -EAGAIN;
- if (!sk_rmem_schedule(sk, skb, skb->len)) {
- kfree(msg);
- return -EAGAIN;
- }
+ return NULL;
+ sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS);
+ return msg;
+}
- sk_msg_init(msg);
- num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
- if (unlikely(num_sge < 0)) {
- kfree(msg);
- return num_sge;
+static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+ return NULL;
+
+ if (!sk_rmem_schedule(sk, skb, skb->truesize))
+ return NULL;
+
+ return alloc_sk_msg(GFP_KERNEL);
+}
+
+static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
+ u32 off, u32 len,
+ struct sk_psock *psock,
+ struct sock *sk,
+ struct sk_msg *msg)
+{
+ int num_sge, copied;
+
+ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
+ if (num_sge < 0) {
+ /* skb linearize may fail with ENOMEM, but lets simply try again
+ * later if this happens. Under memory pressure we don't want to
+ * drop the skb. We need to linearize the skb so that the mapping
+ * in skb_to_sgvec can not error.
+ */
+ if (skb_linearize(skb))
+ return -EAGAIN;
+
+ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
+ if (unlikely(num_sge < 0))
+ return num_sge;
}
- sk_mem_charge(sk, skb->len);
- copied = skb->len;
+ copied = len;
msg->sg.start = 0;
msg->sg.size = copied;
msg->sg.end = num_sge;
@@ -429,55 +557,134 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
return copied;
}
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
+ u32 off, u32 len);
+
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
+ u32 off, u32 len)
+{
+ struct sock *sk = psock->sk;
+ struct sk_msg *msg;
+ int err;
+
+ /* If we are receiving on the same sock skb->sk is already assigned,
+ * skip memory accounting and owner transition seeing it already set
+ * correctly.
+ */
+ if (unlikely(skb->sk == sk))
+ return sk_psock_skb_ingress_self(psock, skb, off, len);
+ msg = sk_psock_create_ingress_msg(sk, skb);
+ if (!msg)
+ return -EAGAIN;
+
+ /* This will transition ownership of the data from the socket where
+ * the BPF program was run initiating the redirect to the socket
+ * we will eventually receive this data on. The data will be released
+ * from skb_consume found in __tcp_bpf_recvmsg() after its been copied
+ * into user buffers.
+ */
+ skb_set_owner_r(skb, sk);
+ err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
+ if (err < 0)
+ kfree(msg);
+ return err;
+}
+
+/* Puts an skb on the ingress queue of the socket already assigned to the
+ * skb. In this case we do not need to check memory limits or skb_set_owner_r
+ * because the skb is already accounted for here.
+ */
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
+ u32 off, u32 len)
+{
+ struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
+ struct sock *sk = psock->sk;
+ int err;
+
+ if (unlikely(!msg))
+ return -EAGAIN;
+ skb_set_owner_r(skb, sk);
+ err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
+ if (err < 0)
+ kfree(msg);
+ return err;
+}
+
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len, bool ingress)
{
- if (ingress)
- return sk_psock_skb_ingress(psock, skb);
- else
- return skb_send_sock_locked(psock->sk, skb, off, len);
+ if (!ingress) {
+ if (!sock_writeable(psock->sk))
+ return -EAGAIN;
+ return skb_send_sock(psock->sk, skb, off, len);
+ }
+ return sk_psock_skb_ingress(psock, skb, off, len);
+}
+
+static void sk_psock_skb_state(struct sk_psock *psock,
+ struct sk_psock_work_state *state,
+ struct sk_buff *skb,
+ int len, int off)
+{
+ spin_lock_bh(&psock->ingress_lock);
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+ state->skb = skb;
+ state->len = len;
+ state->off = off;
+ } else {
+ sock_drop(psock->sk, skb);
+ }
+ spin_unlock_bh(&psock->ingress_lock);
}
static void sk_psock_backlog(struct work_struct *work)
{
struct sk_psock *psock = container_of(work, struct sk_psock, work);
struct sk_psock_work_state *state = &psock->work_state;
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
bool ingress;
u32 len, off;
int ret;
- /* Lock sock to avoid losing sk_socket during loop. */
- lock_sock(psock->sk);
- if (state->skb) {
+ mutex_lock(&psock->work_mutex);
+ if (unlikely(state->skb)) {
+ spin_lock_bh(&psock->ingress_lock);
skb = state->skb;
len = state->len;
off = state->off;
state->skb = NULL;
- goto start;
+ spin_unlock_bh(&psock->ingress_lock);
}
+ if (skb)
+ goto start;
while ((skb = skb_dequeue(&psock->ingress_skb))) {
len = skb->len;
off = 0;
+ if (skb_bpf_strparser(skb)) {
+ struct strp_msg *stm = strp_msg(skb);
+
+ off = stm->offset;
+ len = stm->full_len;
+ }
start:
- ingress = tcp_skb_bpf_ingress(skb);
+ ingress = skb_bpf_ingress(skb);
+ skb_bpf_redirect_clear(skb);
do {
ret = -EIO;
- if (likely(psock->sk->sk_socket))
+ if (!sock_flag(psock->sk, SOCK_DEAD))
ret = sk_psock_handle_skb(psock, skb, off,
len, ingress);
if (ret <= 0) {
if (ret == -EAGAIN) {
- state->skb = skb;
- state->len = len;
- state->off = off;
+ sk_psock_skb_state(psock, state, skb,
+ len, off);
goto end;
}
/* Hard errors break pipe and stop xmit. */
sk_psock_report_error(psock, ret ? -ret : EPIPE);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
- kfree_skb(skb);
+ sock_drop(psock->sk, skb);
goto end;
}
off += ret;
@@ -488,33 +695,60 @@ start:
kfree_skb(skb);
}
end:
- release_sock(psock->sk);
+ mutex_unlock(&psock->work_mutex);
}
struct sk_psock *sk_psock_init(struct sock *sk, int node)
{
- struct sk_psock *psock = kzalloc_node(sizeof(*psock),
- GFP_ATOMIC | __GFP_NOWARN,
- node);
- if (!psock)
- return NULL;
+ struct sk_psock *psock;
+ struct proto *prot;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) {
+ psock = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ if (sk->sk_user_data) {
+ psock = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node);
+ if (!psock) {
+ psock = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ prot = READ_ONCE(sk->sk_prot);
psock->sk = sk;
- psock->eval = __SK_NONE;
+ psock->eval = __SK_NONE;
+ psock->sk_proto = prot;
+ psock->saved_unhash = prot->unhash;
+ psock->saved_destroy = prot->destroy;
+ psock->saved_close = prot->close;
+ psock->saved_write_space = sk->sk_write_space;
INIT_LIST_HEAD(&psock->link);
spin_lock_init(&psock->link_lock);
INIT_WORK(&psock->work, sk_psock_backlog);
+ mutex_init(&psock->work_mutex);
INIT_LIST_HEAD(&psock->ingress_msg);
+ spin_lock_init(&psock->ingress_lock);
skb_queue_head_init(&psock->ingress_skb);
sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
refcount_set(&psock->refcnt, 1);
- rcu_assign_sk_user_data(sk, psock);
+ __rcu_assign_sk_user_data_with_flags(sk, psock,
+ SK_USER_DATA_NOCOPY |
+ SK_USER_DATA_PSOCK);
sock_hold(sk);
+out:
+ write_unlock_bh(&sk->sk_callback_lock);
return psock;
}
EXPORT_SYMBOL_GPL(sk_psock_init);
@@ -532,7 +766,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
return link;
}
-void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
{
struct sk_msg *msg, *tmp;
@@ -543,9 +777,19 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
}
}
-static void sk_psock_zap_ingress(struct sk_psock *psock)
+static void __sk_psock_zap_ingress(struct sk_psock *psock)
{
- __skb_queue_purge(&psock->ingress_skb);
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) {
+ skb_bpf_redirect_clear(skb);
+ sock_drop(psock->sk, skb);
+ }
+ kfree_skb(psock->work_state.skb);
+ /* We null the skb here to ensure that calls to sk_psock_backlog
+ * do not pick up the free'd skb.
+ */
+ psock->work_state.skb = NULL;
__sk_psock_purge_ingress_msg(psock);
}
@@ -559,23 +803,32 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
}
}
-static void sk_psock_destroy_deferred(struct work_struct *gc)
+void sk_psock_stop(struct sk_psock *psock)
{
- struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
+ spin_lock_bh(&psock->ingress_lock);
+ sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+ sk_psock_cork_free(psock);
+ __sk_psock_zap_ingress(psock);
+ spin_unlock_bh(&psock->ingress_lock);
+}
+static void sk_psock_done_strp(struct sk_psock *psock);
+
+static void sk_psock_destroy(struct work_struct *work)
+{
+ struct sk_psock *psock = container_of(to_rcu_work(work),
+ struct sk_psock, rwork);
/* No sk_callback_lock since already detached. */
- /* Parser has been stopped */
- if (psock->progs.skb_parser)
- strp_done(&psock->parser.strp);
+ sk_psock_done_strp(psock);
cancel_work_sync(&psock->work);
+ mutex_destroy(&psock->work_mutex);
psock_progs_drop(&psock->progs);
sk_psock_link_destroy(psock);
sk_psock_cork_free(psock);
- sk_psock_zap_ingress(psock);
if (psock->sk_redir)
sock_put(psock->sk_redir);
@@ -583,29 +836,21 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
kfree(psock);
}
-void sk_psock_destroy(struct rcu_head *rcu)
-{
- struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
-
- INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
- schedule_work(&psock->gc);
-}
-EXPORT_SYMBOL_GPL(sk_psock_destroy);
-
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
- sk_psock_cork_free(psock);
- sk_psock_zap_ingress(psock);
-
write_lock_bh(&sk->sk_callback_lock);
sk_psock_restore_proto(sk, psock);
rcu_assign_sk_user_data(sk, NULL);
- if (psock->progs.skb_parser)
+ if (psock->progs.stream_parser)
sk_psock_stop_strp(sk, psock);
+ else if (psock->progs.stream_verdict || psock->progs.skb_verdict)
+ sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
- sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
- call_rcu(&psock->rcu, sk_psock_destroy);
+ sk_psock_stop(psock);
+
+ INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
+ queue_rcu_work(system_wq, &psock->rwork);
}
EXPORT_SYMBOL_GPL(sk_psock_drop);
@@ -628,7 +873,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
struct bpf_prog *prog;
int ret;
- preempt_disable();
rcu_read_lock();
prog = READ_ONCE(psock->progs.msg_parser);
if (unlikely(!prog)) {
@@ -638,7 +882,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
sk_msg_compute_data_pointers(msg);
msg->sk = sk;
- ret = BPF_PROG_RUN(prog, msg);
+ ret = bpf_prog_run_pin_on_cpu(prog, msg);
ret = sk_psock_map_verd(ret, msg->sk_redir);
psock->apply_bytes = msg->apply_bytes;
if (ret == __SK_REDIRECT) {
@@ -653,107 +897,191 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
}
out:
rcu_read_unlock();
- preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
-static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
- struct sk_buff *skb)
+static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
{
- int ret;
+ struct sk_psock *psock_other;
+ struct sock *sk_other;
- skb->sk = psock->sk;
- bpf_compute_data_end_sk_skb(skb);
- preempt_disable();
- ret = BPF_PROG_RUN(prog, skb);
- preempt_enable();
- /* strparser clones the skb before handing it to a upper layer,
- * meaning skb_orphan has been called. We NULL sk on the way out
- * to ensure we don't trigger a BUG_ON() in skb/sk operations
- * later and because we are not charging the memory of this skb
- * to any socket yet.
+ sk_other = skb_bpf_redirect_fetch(skb);
+ /* This error is a buggy BPF program, it returned a redirect
+ * return code, but then didn't set a redirect interface.
*/
- skb->sk = NULL;
- return ret;
+ if (unlikely(!sk_other)) {
+ skb_bpf_redirect_clear(skb);
+ sock_drop(from->sk, skb);
+ return -EIO;
+ }
+ psock_other = sk_psock(sk_other);
+ /* This error indicates the socket is being torn down or had another
+ * error that caused the pipe to break. We can't send a packet on
+ * a socket that is in this state so we drop the skb.
+ */
+ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
+ skb_bpf_redirect_clear(skb);
+ sock_drop(from->sk, skb);
+ return -EIO;
+ }
+ spin_lock_bh(&psock_other->ingress_lock);
+ if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+ spin_unlock_bh(&psock_other->ingress_lock);
+ skb_bpf_redirect_clear(skb);
+ sock_drop(from->sk, skb);
+ return -EIO;
+ }
+
+ skb_queue_tail(&psock_other->ingress_skb, skb);
+ schedule_work(&psock_other->work);
+ spin_unlock_bh(&psock_other->ingress_lock);
+ return 0;
+}
+
+static void sk_psock_tls_verdict_apply(struct sk_buff *skb,
+ struct sk_psock *from, int verdict)
+{
+ switch (verdict) {
+ case __SK_REDIRECT:
+ sk_psock_skb_redirect(from, skb);
+ break;
+ case __SK_PASS:
+ case __SK_DROP:
+ default:
+ break;
+ }
}
-static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
+int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
{
- struct sk_psock_parser *parser;
+ struct bpf_prog *prog;
+ int ret = __SK_PASS;
- parser = container_of(strp, struct sk_psock_parser, strp);
- return container_of(parser, struct sk_psock, parser);
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.stream_verdict);
+ if (likely(prog)) {
+ skb->sk = psock->sk;
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
+ skb->sk = NULL;
+ }
+ sk_psock_tls_verdict_apply(skb, psock, ret);
+ rcu_read_unlock();
+ return ret;
}
+EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
-static void sk_psock_verdict_apply(struct sk_psock *psock,
- struct sk_buff *skb, int verdict)
+static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
+ int verdict)
{
- struct sk_psock *psock_other;
struct sock *sk_other;
- bool ingress;
+ int err = 0;
+ u32 len, off;
switch (verdict) {
case __SK_PASS:
+ err = -EIO;
sk_other = psock->sk;
if (sock_flag(sk_other, SOCK_DEAD) ||
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+ skb_bpf_redirect_clear(skb);
goto out_free;
}
- if (atomic_read(&sk_other->sk_rmem_alloc) <=
- sk_other->sk_rcvbuf) {
- struct tcp_skb_cb *tcp = TCP_SKB_CB(skb);
- tcp->bpf.flags |= BPF_F_INGRESS;
- skb_queue_tail(&psock->ingress_skb, skb);
- schedule_work(&psock->work);
- break;
+ skb_bpf_set_ingress(skb);
+
+ /* If the queue is empty then we can submit directly
+ * into the msg queue. If its not empty we have to
+ * queue work otherwise we may get OOO data. Otherwise,
+ * if sk_psock_skb_ingress errors will be handled by
+ * retrying later from workqueue.
+ */
+ if (skb_queue_empty(&psock->ingress_skb)) {
+ len = skb->len;
+ off = 0;
+ if (skb_bpf_strparser(skb)) {
+ struct strp_msg *stm = strp_msg(skb);
+
+ off = stm->offset;
+ len = stm->full_len;
+ }
+ err = sk_psock_skb_ingress_self(psock, skb, off, len);
}
- goto out_free;
- case __SK_REDIRECT:
- sk_other = tcp_skb_bpf_redirect_fetch(skb);
- if (unlikely(!sk_other))
- goto out_free;
- psock_other = sk_psock(sk_other);
- if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
- !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
- goto out_free;
- ingress = tcp_skb_bpf_ingress(skb);
- if ((!ingress && sock_writeable(sk_other)) ||
- (ingress &&
- atomic_read(&sk_other->sk_rmem_alloc) <=
- sk_other->sk_rcvbuf)) {
- if (!ingress)
- skb_set_owner_w(skb, sk_other);
- skb_queue_tail(&psock_other->ingress_skb, skb);
- schedule_work(&psock_other->work);
- break;
+ if (err < 0) {
+ spin_lock_bh(&psock->ingress_lock);
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+ skb_queue_tail(&psock->ingress_skb, skb);
+ schedule_work(&psock->work);
+ err = 0;
+ }
+ spin_unlock_bh(&psock->ingress_lock);
+ if (err < 0) {
+ skb_bpf_redirect_clear(skb);
+ goto out_free;
+ }
}
- /* fall-through */
+ break;
+ case __SK_REDIRECT:
+ err = sk_psock_skb_redirect(psock, skb);
+ break;
case __SK_DROP:
- /* fall-through */
default:
out_free:
- kfree_skb(skb);
+ sock_drop(psock->sk, skb);
}
+
+ return err;
}
+static void sk_psock_write_space(struct sock *sk)
+{
+ struct sk_psock *psock;
+ void (*write_space)(struct sock *sk) = NULL;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock)) {
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ schedule_work(&psock->work);
+ write_space = psock->saved_write_space;
+ }
+ rcu_read_unlock();
+ if (write_space)
+ write_space(sk);
+}
+
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
{
- struct sk_psock *psock = sk_psock_from_strp(strp);
+ struct sk_psock *psock;
struct bpf_prog *prog;
int ret = __SK_DROP;
+ struct sock *sk;
rcu_read_lock();
- prog = READ_ONCE(psock->progs.skb_verdict);
+ sk = strp->sk;
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ sock_drop(sk, skb);
+ goto out;
+ }
+ prog = READ_ONCE(psock->progs.stream_verdict);
if (likely(prog)) {
- skb_orphan(skb);
- tcp_skb_bpf_redirect_clear(skb);
- ret = sk_psock_bpf_run(psock, prog, skb);
- ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+ skb->sk = sk;
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ if (ret == SK_PASS)
+ skb_bpf_set_strparser(skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
+ skb->sk = NULL;
}
- rcu_read_unlock();
sk_psock_verdict_apply(psock, skb, ret);
+out:
+ rcu_read_unlock();
}
static int sk_psock_strp_read_done(struct strparser *strp, int err)
@@ -763,14 +1091,17 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err)
static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
{
- struct sk_psock *psock = sk_psock_from_strp(strp);
+ struct sk_psock *psock = container_of(strp, struct sk_psock, strp);
struct bpf_prog *prog;
int ret = skb->len;
rcu_read_lock();
- prog = READ_ONCE(psock->progs.skb_parser);
- if (likely(prog))
- ret = sk_psock_bpf_run(psock, prog, skb);
+ prog = READ_ONCE(psock->progs.stream_parser);
+ if (likely(prog)) {
+ skb->sk = psock->sk;
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ skb->sk = NULL;
+ }
rcu_read_unlock();
return ret;
}
@@ -783,28 +1114,15 @@ static void sk_psock_strp_data_ready(struct sock *sk)
rcu_read_lock();
psock = sk_psock(sk);
if (likely(psock)) {
- write_lock_bh(&sk->sk_callback_lock);
- strp_data_ready(&psock->parser.strp);
- write_unlock_bh(&sk->sk_callback_lock);
- }
- rcu_read_unlock();
-}
-
-static void sk_psock_write_space(struct sock *sk)
-{
- struct sk_psock *psock;
- void (*write_space)(struct sock *sk) = NULL;
-
- rcu_read_lock();
- psock = sk_psock(sk);
- if (likely(psock)) {
- if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
- schedule_work(&psock->work);
- write_space = psock->saved_write_space;
+ if (tls_sw_has_ctx_rx(sk)) {
+ psock->saved_data_ready(sk);
+ } else {
+ write_lock_bh(&sk->sk_callback_lock);
+ strp_data_ready(&psock->strp);
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
}
rcu_read_unlock();
- if (write_space)
- write_space(sk);
}
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
@@ -815,32 +1133,103 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
.parse_msg = sk_psock_strp_parse,
};
- psock->parser.enabled = false;
- return strp_init(&psock->parser.strp, sk, &cb);
+ return strp_init(&psock->strp, sk, &cb);
}
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock_parser *parser = &psock->parser;
-
- if (parser->enabled)
+ if (psock->saved_data_ready)
return;
- parser->saved_data_ready = sk->sk_data_ready;
+ psock->saved_data_ready = sk->sk_data_ready;
sk->sk_data_ready = sk_psock_strp_data_ready;
sk->sk_write_space = sk_psock_write_space;
- parser->enabled = true;
}
void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock_parser *parser = &psock->parser;
+ psock_set_prog(&psock->progs.stream_parser, NULL);
+
+ if (!psock->saved_data_ready)
+ return;
+
+ sk->sk_data_ready = psock->saved_data_ready;
+ psock->saved_data_ready = NULL;
+ strp_stop(&psock->strp);
+}
+
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+ /* Parser has been stopped */
+ if (psock->progs.stream_parser)
+ strp_done(&psock->strp);
+}
+#else
+static void sk_psock_done_strp(struct sk_psock *psock)
+{
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
+static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_psock *psock;
+ struct bpf_prog *prog;
+ int ret = __SK_DROP;
+ int len = skb->len;
+
+ skb_get(skb);
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ len = 0;
+ sock_drop(sk, skb);
+ goto out;
+ }
+ prog = READ_ONCE(psock->progs.stream_verdict);
+ if (!prog)
+ prog = READ_ONCE(psock->progs.skb_verdict);
+ if (likely(prog)) {
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
+ }
+ ret = sk_psock_verdict_apply(psock, skb, ret);
+ if (ret < 0)
+ len = ret;
+out:
+ rcu_read_unlock();
+ return len;
+}
+
+static void sk_psock_verdict_data_ready(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (unlikely(!sock || !sock->ops || !sock->ops->read_skb))
+ return;
+ sock->ops->read_skb(sk, sk_psock_verdict_recv);
+}
+
+void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
+{
+ if (psock->saved_data_ready)
+ return;
+
+ psock->saved_data_ready = sk->sk_data_ready;
+ sk->sk_data_ready = sk_psock_verdict_data_ready;
+ sk->sk_write_space = sk_psock_write_space;
+}
+
+void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
+{
+ psock_set_prog(&psock->progs.stream_verdict, NULL);
+ psock_set_prog(&psock->progs.skb_verdict, NULL);
- if (!parser->enabled)
+ if (!psock->saved_data_ready)
return;
- sk->sk_data_ready = parser->saved_data_ready;
- parser->saved_data_ready = NULL;
- strp_stop(&parser->strp);
- parser->enabled = false;
+ sk->sk_data_ready = psock->saved_data_ready;
+ psock->saved_data_ready = NULL;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 8f71684305c3..a3ba0358c77c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -113,6 +113,7 @@
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
@@ -138,10 +139,15 @@
#include <net/tcp.h>
#include <net/busy_poll.h>
+#include <linux/ethtool.h>
+
+#include "dev.h"
+
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
-static void sock_inuse_add(struct net *net, int val);
+static void sock_def_write_space_wfree(struct sock *sk);
+static void sock_def_write_space(struct sock *sk);
/**
* sk_ns_capable - General socket capability test
@@ -223,6 +229,7 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
+ x "AF_MCTP" , \
x "AF_MAX"
static const char *const af_family_key_strings[AF_MAX+1] = {
@@ -323,14 +330,33 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
noreclaim_flag = memalloc_noreclaim_save();
- ret = sk->sk_backlog_rcv(sk, skb);
+ ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
+ tcp_v6_do_rcv,
+ tcp_v4_do_rcv,
+ sk, skb);
memalloc_noreclaim_restore(noreclaim_flag);
return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);
-static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
+void sk_error_report(struct sock *sk)
+{
+ sk->sk_error_report(sk);
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ fallthrough;
+ case AF_INET6:
+ trace_inet_sk_error_report(sk);
+ break;
+ default:
+ break;
+ }
+}
+EXPORT_SYMBOL(sk_error_report);
+
+int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
struct __kernel_sock_timeval tv;
@@ -359,36 +385,50 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
*(struct __kernel_sock_timeval *)optval = tv;
return sizeof(tv);
}
+EXPORT_SYMBOL(sock_get_timeout);
-static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval)
+int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
+ sockptr_t optval, int optlen, bool old_timeval)
{
- struct __kernel_sock_timeval tv;
-
if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
struct old_timeval32 tv32;
if (optlen < sizeof(tv32))
return -EINVAL;
- if (copy_from_user(&tv32, optval, sizeof(tv32)))
+ if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
return -EFAULT;
- tv.tv_sec = tv32.tv_sec;
- tv.tv_usec = tv32.tv_usec;
+ tv->tv_sec = tv32.tv_sec;
+ tv->tv_usec = tv32.tv_usec;
} else if (old_timeval) {
struct __kernel_old_timeval old_tv;
if (optlen < sizeof(old_tv))
return -EINVAL;
- if (copy_from_user(&old_tv, optval, sizeof(old_tv)))
+ if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
return -EFAULT;
- tv.tv_sec = old_tv.tv_sec;
- tv.tv_usec = old_tv.tv_usec;
+ tv->tv_sec = old_tv.tv_sec;
+ tv->tv_usec = old_tv.tv_usec;
} else {
- if (optlen < sizeof(tv))
+ if (optlen < sizeof(*tv))
return -EINVAL;
- if (copy_from_user(&tv, optval, sizeof(tv)))
+ if (copy_from_sockptr(tv, optval, sizeof(*tv)))
return -EFAULT;
}
+
+ return 0;
+}
+EXPORT_SYMBOL(sock_copy_user_timeval);
+
+static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
+ bool old_timeval)
+{
+ struct __kernel_sock_timeval tv;
+ int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
+
+ if (err)
+ return err;
+
if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
return -EDOM;
@@ -411,18 +451,6 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool
return 0;
}
-static void sock_warn_obsolete_bsdism(const char *name)
-{
- static int warned;
- static char warncomm[TASK_COMM_LEN];
- if (strcmp(warncomm, current->comm) && warned < 5) {
- strcpy(warncomm, current->comm);
- pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
- warncomm, name);
- warned++;
- }
-}
-
static bool sock_needs_netstamp(const struct sock *sk)
{
switch (sk->sk_family) {
@@ -480,17 +508,35 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason *reason)
{
+ enum skb_drop_reason drop_reason;
int err;
err = sk_filter(sk, skb);
- if (err)
- return err;
-
- return __sock_queue_rcv_skb(sk, skb);
+ if (err) {
+ drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+ goto out;
+ }
+ err = __sock_queue_rcv_skb(sk, skb);
+ switch (err) {
+ case -ENOMEM:
+ drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ break;
+ case -ENOBUFS:
+ drop_reason = SKB_DROP_REASON_PROTO_MEM;
+ break;
+ default:
+ drop_reason = SKB_NOT_DROPPED_YET;
+ break;
+ }
+out:
+ if (reason)
+ *reason = drop_reason;
+ return err;
}
-EXPORT_SYMBOL(sock_queue_rcv_skb);
+EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
const int nested, unsigned int trim_cap, bool refcounted)
@@ -536,11 +582,17 @@ discard_and_relse:
}
EXPORT_SYMBOL(__sk_receive_skb);
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
+ u32));
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
+ u32));
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
struct dst_entry *dst = __sk_dst_get(sk);
- if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ if (dst && dst->obsolete &&
+ INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
+ dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
sk->sk_dst_pending_confirm = 0;
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
@@ -556,7 +608,9 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
struct dst_entry *dst = sk_dst_get(sk);
- if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ if (dst && dst->obsolete &&
+ INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
+ dst, cookie) == NULL) {
sk_dst_reset(sk);
dst_release(dst);
return NULL;
@@ -566,7 +620,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
}
EXPORT_SYMBOL(sk_dst_check);
-static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
+static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
@@ -574,14 +628,16 @@ static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
/* Sorry... */
ret = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
goto out;
ret = -EINVAL;
if (ifindex < 0)
goto out;
- sk->sk_bound_dev_if = ifindex;
+ /* Paired with all READ_ONCE() done locklessly. */
+ WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
+
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
sk_dst_reset(sk);
@@ -594,8 +650,21 @@ out:
return ret;
}
-static int sock_setbindtodevice(struct sock *sk, char __user *optval,
- int optlen)
+int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
+{
+ int ret;
+
+ if (lock_sk)
+ lock_sock(sk);
+ ret = sock_bindtoindex_locked(sk, ifindex);
+ if (lock_sk)
+ release_sock(sk);
+
+ return ret;
+}
+EXPORT_SYMBOL(sock_bindtoindex);
+
+static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
@@ -617,7 +686,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval,
memset(devname, 0, sizeof(devname));
ret = -EFAULT;
- if (copy_from_user(devname, optval, optlen))
+ if (copy_from_sockptr(devname, optval, optlen))
goto out;
index = 0;
@@ -634,25 +703,25 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval,
goto out;
}
- lock_sock(sk);
- ret = sock_setbindtodevice_locked(sk, index);
- release_sock(sk);
-
+ sockopt_lock_sock(sk);
+ ret = sock_bindtoindex_locked(sk, index);
+ sockopt_release_sock(sk);
out:
#endif
return ret;
}
-static int sock_getbindtodevice(struct sock *sk, char __user *optval,
- int __user *optlen, int len)
+static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
+ int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
struct net *net = sock_net(sk);
char devname[IFNAMSIZ];
- if (sk->sk_bound_dev_if == 0) {
+ if (bound_dev_if == 0) {
len = 0;
goto zero;
}
@@ -661,19 +730,19 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval,
if (len < IFNAMSIZ)
goto out;
- ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
+ ret = netdev_get_name(net, devname, bound_dev_if);
if (ret)
goto out;
len = strlen(devname) + 1;
ret = -EFAULT;
- if (copy_to_user(optval, devname, len))
+ if (copy_to_sockptr(optval, devname, len))
goto out;
zero:
ret = -EFAULT;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
goto out;
ret = 0;
@@ -684,15 +753,6 @@ out:
return ret;
}
-static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
- int valbool)
-{
- if (valbool)
- sock_set_flag(sk, bit);
- else
- sock_reset_flag(sk, bit);
-}
-
bool sk_mc_loop(struct sock *sk)
{
if (dev_recursion_level())
@@ -707,21 +767,322 @@ bool sk_mc_loop(struct sock *sk)
return inet6_sk(sk)->mc_loop;
#endif
}
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return true;
}
EXPORT_SYMBOL(sk_mc_loop);
+void sock_set_reuseaddr(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_reuse = SK_CAN_REUSE;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseaddr);
+
+void sock_set_reuseport(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_reuseport = true;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseport);
+
+void sock_no_linger(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_lingertime = 0;
+ sock_set_flag(sk, SOCK_LINGER);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_no_linger);
+
+void sock_set_priority(struct sock *sk, u32 priority)
+{
+ lock_sock(sk);
+ sk->sk_priority = priority;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_priority);
+
+void sock_set_sndtimeo(struct sock *sk, s64 secs)
+{
+ lock_sock(sk);
+ if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
+ sk->sk_sndtimeo = secs * HZ;
+ else
+ sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_sndtimeo);
+
+static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
+{
+ if (val) {
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
+ sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
+ sock_set_flag(sk, SOCK_RCVTSTAMP);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ } else {
+ sock_reset_flag(sk, SOCK_RCVTSTAMP);
+ sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+ }
+}
+
+void sock_enable_timestamps(struct sock *sk)
+{
+ lock_sock(sk);
+ __sock_set_timestamps(sk, true, false, true);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_enable_timestamps);
+
+void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
+{
+ switch (optname) {
+ case SO_TIMESTAMP_OLD:
+ __sock_set_timestamps(sk, valbool, false, false);
+ break;
+ case SO_TIMESTAMP_NEW:
+ __sock_set_timestamps(sk, valbool, true, false);
+ break;
+ case SO_TIMESTAMPNS_OLD:
+ __sock_set_timestamps(sk, valbool, false, true);
+ break;
+ case SO_TIMESTAMPNS_NEW:
+ __sock_set_timestamps(sk, valbool, true, true);
+ break;
+ }
+}
+
+static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
+{
+ struct net *net = sock_net(sk);
+ struct net_device *dev = NULL;
+ bool match = false;
+ int *vclock_index;
+ int i, num;
+
+ if (sk->sk_bound_dev_if)
+ dev = dev_get_by_index(net, sk->sk_bound_dev_if);
+
+ if (!dev) {
+ pr_err("%s: sock not bind to device\n", __func__);
+ return -EOPNOTSUPP;
+ }
+
+ num = ethtool_get_phc_vclocks(dev, &vclock_index);
+ dev_put(dev);
+
+ for (i = 0; i < num; i++) {
+ if (*(vclock_index + i) == phc_index) {
+ match = true;
+ break;
+ }
+ }
+
+ if (num > 0)
+ kfree(vclock_index);
+
+ if (!match)
+ return -EINVAL;
+
+ sk->sk_bind_phc = phc_index;
+
+ return 0;
+}
+
+int sock_set_timestamping(struct sock *sk, int optname,
+ struct so_timestamping timestamping)
+{
+ int val = timestamping.flags;
+ int ret;
+
+ if (val & ~SOF_TIMESTAMPING_MASK)
+ return -EINVAL;
+
+ if (val & SOF_TIMESTAMPING_OPT_ID &&
+ !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
+ if (sk_is_tcp(sk)) {
+ if ((1 << sk->sk_state) &
+ (TCPF_CLOSE | TCPF_LISTEN))
+ return -EINVAL;
+ atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
+ } else {
+ atomic_set(&sk->sk_tskey, 0);
+ }
+ }
+
+ if (val & SOF_TIMESTAMPING_OPT_STATS &&
+ !(val & SOF_TIMESTAMPING_OPT_TSONLY))
+ return -EINVAL;
+
+ if (val & SOF_TIMESTAMPING_BIND_PHC) {
+ ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
+ if (ret)
+ return ret;
+ }
+
+ sk->sk_tsflags = val;
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
+
+ if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
+ sock_enable_timestamp(sk,
+ SOCK_TIMESTAMPING_RX_SOFTWARE);
+ else
+ sock_disable_timestamp(sk,
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
+ return 0;
+}
+
+void sock_set_keepalive(struct sock *sk)
+{
+ lock_sock(sk);
+ if (sk->sk_prot->keepalive)
+ sk->sk_prot->keepalive(sk, true);
+ sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_keepalive);
+
+static void __sock_set_rcvbuf(struct sock *sk, int val)
+{
+ /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
+ * as a negative value.
+ */
+ val = min_t(int, val, INT_MAX / 2);
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+
+ /* We double it on the way in to account for "struct sk_buff" etc.
+ * overhead. Applications assume that the SO_RCVBUF setting they make
+ * will allow that much actual data to be received on that socket.
+ *
+ * Applications are unaware that "struct sk_buff" and other overheads
+ * allocate from the receive buffer during socket buffer allocation.
+ *
+ * And after considering the possible alternatives, returning the value
+ * we actually used in getsockopt is the most desirable behavior.
+ */
+ WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
+}
+
+void sock_set_rcvbuf(struct sock *sk, int val)
+{
+ lock_sock(sk);
+ __sock_set_rcvbuf(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_rcvbuf);
+
+static void __sock_set_mark(struct sock *sk, u32 val)
+{
+ if (val != sk->sk_mark) {
+ sk->sk_mark = val;
+ sk_dst_reset(sk);
+ }
+}
+
+void sock_set_mark(struct sock *sk, u32 val)
+{
+ lock_sock(sk);
+ __sock_set_mark(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_mark);
+
+static void sock_release_reserved_memory(struct sock *sk, int bytes)
+{
+ /* Round down bytes to multiple of pages */
+ bytes = round_down(bytes, PAGE_SIZE);
+
+ WARN_ON(bytes > sk->sk_reserved_mem);
+ sk->sk_reserved_mem -= bytes;
+ sk_mem_reclaim(sk);
+}
+
+static int sock_reserve_memory(struct sock *sk, int bytes)
+{
+ long allocated;
+ bool charged;
+ int pages;
+
+ if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
+ return -EOPNOTSUPP;
+
+ if (!bytes)
+ return 0;
+
+ pages = sk_mem_pages(bytes);
+
+ /* pre-charge to memcg */
+ charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!charged)
+ return -ENOMEM;
+
+ /* pre-charge to forward_alloc */
+ sk_memory_allocated_add(sk, pages);
+ allocated = sk_memory_allocated(sk);
+ /* If the system goes into memory pressure with this
+ * precharge, give up and return error.
+ */
+ if (allocated > sk_prot_mem_limits(sk, 1)) {
+ sk_memory_allocated_sub(sk, pages);
+ mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
+ return -ENOMEM;
+ }
+ sk->sk_forward_alloc += pages << PAGE_SHIFT;
+
+ sk->sk_reserved_mem += pages << PAGE_SHIFT;
+
+ return 0;
+}
+
+void sockopt_lock_sock(struct sock *sk)
+{
+ /* When current->bpf_ctx is set, the setsockopt is called from
+ * a bpf prog. bpf has ensured the sk lock has been
+ * acquired before calling setsockopt().
+ */
+ if (has_current_bpf_ctx())
+ return;
+
+ lock_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_lock_sock);
+
+void sockopt_release_sock(struct sock *sk)
+{
+ if (has_current_bpf_ctx())
+ return;
+
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_release_sock);
+
+bool sockopt_ns_capable(struct user_namespace *ns, int cap)
+{
+ return has_current_bpf_ctx() || ns_capable(ns, cap);
+}
+EXPORT_SYMBOL(sockopt_ns_capable);
+
+bool sockopt_capable(int cap)
+{
+ return has_current_bpf_ctx() || capable(cap);
+}
+EXPORT_SYMBOL(sockopt_capable);
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
*/
-int sock_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+int sk_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
+ struct so_timestamping timestamping;
+ struct socket *sock = sk->sk_socket;
struct sock_txtime sk_txtime;
- struct sock *sk = sock->sk;
int val;
int valbool;
struct linger ling;
@@ -737,16 +1098,16 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
valbool = val ? 1 : 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
switch (optname) {
case SO_DEBUG:
- if (val && !capable(CAP_NET_ADMIN))
+ if (val && !sockopt_capable(CAP_NET_ADMIN))
ret = -EACCES;
else
sock_valbool_flag(sk, SOCK_DBG, valbool);
@@ -776,7 +1137,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
- val = min_t(u32, val, sysctl_wmem_max);
+ val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
/* Ensure val * 2 fits into an int, to prevent max_t()
* from treating it as a negative value.
@@ -790,7 +1151,7 @@ set_sndbuf:
break;
case SO_SNDBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -808,34 +1169,11 @@ set_sndbuf:
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
- val = min_t(u32, val, sysctl_rmem_max);
-set_rcvbuf:
- /* Ensure val * 2 fits into an int, to prevent max_t()
- * from treating it as a negative value.
- */
- val = min_t(int, val, INT_MAX / 2);
- sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- /*
- * We double it on the way in to account for
- * "struct sk_buff" etc. overhead. Applications
- * assume that the SO_RCVBUF setting they make will
- * allow that much actual data to be received on that
- * socket.
- *
- * Applications are unaware that "struct sk_buff" and
- * other overheads allocate from the receive buffer
- * during socket buffer allocation.
- *
- * And after considering the possible alternatives,
- * returning the value we actually used in getsockopt
- * is the most desirable behavior.
- */
- WRITE_ONCE(sk->sk_rcvbuf,
- max_t(int, val * 2, SOCK_MIN_RCVBUF));
+ __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
break;
case SO_RCVBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -843,9 +1181,8 @@ set_rcvbuf:
/* No negative values (to prevent underflow, as val will be
* multiplied by 2).
*/
- if (val < 0)
- val = 0;
- goto set_rcvbuf;
+ __sock_set_rcvbuf(sk, max(val, 0));
+ break;
case SO_KEEPALIVE:
if (sk->sk_prot->keepalive)
@@ -863,7 +1200,8 @@ set_rcvbuf:
case SO_PRIORITY:
if ((val >= 0 && val <= 6) ||
- ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
sk->sk_priority = val;
else
ret = -EPERM;
@@ -874,7 +1212,7 @@ set_rcvbuf:
ret = -EINVAL; /* 1003.1g */
break;
}
- if (copy_from_user(&ling, optval, sizeof(ling))) {
+ if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
ret = -EFAULT;
break;
}
@@ -892,7 +1230,6 @@ set_rcvbuf:
break;
case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("setsockopt");
break;
case SO_PASSCRED:
@@ -906,72 +1243,28 @@ set_rcvbuf:
case SO_TIMESTAMP_NEW:
case SO_TIMESTAMPNS_OLD:
case SO_TIMESTAMPNS_NEW:
- if (valbool) {
- if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW)
- sock_set_flag(sk, SOCK_TSTAMP_NEW);
- else
- sock_reset_flag(sk, SOCK_TSTAMP_NEW);
-
- if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)
- sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
- else
- sock_set_flag(sk, SOCK_RCVTSTAMPNS);
- sock_set_flag(sk, SOCK_RCVTSTAMP);
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- } else {
- sock_reset_flag(sk, SOCK_RCVTSTAMP);
- sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
- sock_reset_flag(sk, SOCK_TSTAMP_NEW);
- }
+ sock_set_timestamp(sk, optname, valbool);
break;
case SO_TIMESTAMPING_NEW:
- sock_set_flag(sk, SOCK_TSTAMP_NEW);
- /* fall through */
case SO_TIMESTAMPING_OLD:
- if (val & ~SOF_TIMESTAMPING_MASK) {
- ret = -EINVAL;
- break;
- }
-
- if (val & SOF_TIMESTAMPING_OPT_ID &&
- !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM) {
- if ((1 << sk->sk_state) &
- (TCPF_CLOSE | TCPF_LISTEN)) {
- ret = -EINVAL;
- break;
- }
- sk->sk_tskey = tcp_sk(sk)->snd_una;
- } else {
- sk->sk_tskey = 0;
+ if (optlen == sizeof(timestamping)) {
+ if (copy_from_sockptr(&timestamping, optval,
+ sizeof(timestamping))) {
+ ret = -EFAULT;
+ break;
}
+ } else {
+ memset(&timestamping, 0, sizeof(timestamping));
+ timestamping.flags = val;
}
-
- if (val & SOF_TIMESTAMPING_OPT_STATS &&
- !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
- ret = -EINVAL;
- break;
- }
-
- sk->sk_tsflags = val;
- if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
- sock_enable_timestamp(sk,
- SOCK_TIMESTAMPING_RX_SOFTWARE);
- else {
- if (optname == SO_TIMESTAMPING_NEW)
- sock_reset_flag(sk, SOCK_TSTAMP_NEW);
-
- sock_disable_timestamp(sk,
- (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
- }
+ ret = sock_set_timestamping(sk, optname, timestamping);
break;
case SO_RCVLOWAT:
if (val < 0)
val = INT_MAX;
- if (sock->ops->set_rcvlowat)
+ if (sock && sock->ops->set_rcvlowat)
ret = sock->ops->set_rcvlowat(sk, val);
else
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
@@ -979,60 +1272,52 @@ set_rcvbuf:
case SO_RCVTIMEO_OLD:
case SO_RCVTIMEO_NEW:
- ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);
+ ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
+ optlen, optname == SO_RCVTIMEO_OLD);
break;
case SO_SNDTIMEO_OLD:
case SO_SNDTIMEO_NEW:
- ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);
+ ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
+ optlen, optname == SO_SNDTIMEO_OLD);
break;
- case SO_ATTACH_FILTER:
- ret = -EINVAL;
- if (optlen == sizeof(struct sock_fprog)) {
- struct sock_fprog fprog;
-
- ret = -EFAULT;
- if (copy_from_user(&fprog, optval, sizeof(fprog)))
- break;
+ case SO_ATTACH_FILTER: {
+ struct sock_fprog fprog;
+ ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
+ if (!ret)
ret = sk_attach_filter(&fprog, sk);
- }
break;
-
+ }
case SO_ATTACH_BPF:
ret = -EINVAL;
if (optlen == sizeof(u32)) {
u32 ufd;
ret = -EFAULT;
- if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
break;
ret = sk_attach_bpf(ufd, sk);
}
break;
- case SO_ATTACH_REUSEPORT_CBPF:
- ret = -EINVAL;
- if (optlen == sizeof(struct sock_fprog)) {
- struct sock_fprog fprog;
-
- ret = -EFAULT;
- if (copy_from_user(&fprog, optval, sizeof(fprog)))
- break;
+ case SO_ATTACH_REUSEPORT_CBPF: {
+ struct sock_fprog fprog;
+ ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
+ if (!ret)
ret = sk_reuseport_attach_filter(&fprog, sk);
- }
break;
-
+ }
case SO_ATTACH_REUSEPORT_EBPF:
ret = -EINVAL;
if (optlen == sizeof(u32)) {
u32 ufd;
ret = -EFAULT;
- if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
break;
ret = sk_reuseport_attach_bpf(ufd, sk);
@@ -1061,12 +1346,22 @@ set_rcvbuf:
clear_bit(SOCK_PASSSEC, &sock->flags);
break;
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+
+ __sock_set_mark(sk, val);
+ break;
+ case SO_RCVMARK:
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
- } else if (val != sk->sk_mark) {
- sk->sk_mark = val;
- sk_dst_reset(sk);
+ break;
}
+
+ sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
break;
case SO_RXQ_OVFL:
@@ -1095,24 +1390,40 @@ set_rcvbuf:
#ifdef CONFIG_NET_RX_BUSY_POLL
case SO_BUSY_POLL:
/* allow unprivileged users to decrease the value */
- if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
+ if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN))
ret = -EPERM;
else {
if (val < 0)
ret = -EINVAL;
else
- sk->sk_ll_usec = val;
+ WRITE_ONCE(sk->sk_ll_usec, val);
+ }
+ break;
+ case SO_PREFER_BUSY_POLL:
+ if (valbool && !sockopt_capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else
+ WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
+ break;
+ case SO_BUSY_POLL_BUDGET:
+ if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ } else {
+ if (val < 0 || val > U16_MAX)
+ ret = -EINVAL;
+ else
+ WRITE_ONCE(sk->sk_busy_poll_budget, val);
}
break;
#endif
case SO_MAX_PACING_RATE:
{
- unsigned long ulval = (val == ~0U) ? ~0UL : val;
+ unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
if (sizeof(ulval) != sizeof(val) &&
optlen >= sizeof(ulval) &&
- get_user(ulval, (unsigned long __user *)optval)) {
+ copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
ret = -EFAULT;
break;
}
@@ -1135,13 +1446,12 @@ set_rcvbuf:
case SO_ZEROCOPY:
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
- if (!((sk->sk_type == SOCK_STREAM &&
- sk->sk_protocol == IPPROTO_TCP) ||
+ if (!(sk_is_tcp(sk) ||
(sk->sk_type == SOCK_DGRAM &&
sk->sk_protocol == IPPROTO_UDP)))
- ret = -ENOTSUPP;
+ ret = -EOPNOTSUPP;
} else if (sk->sk_family != PF_RDS) {
- ret = -ENOTSUPP;
+ ret = -EOPNOTSUPP;
}
if (!ret) {
if (val < 0 || val > 1)
@@ -1152,38 +1462,98 @@ set_rcvbuf:
break;
case SO_TXTIME:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
- ret = -EPERM;
- } else if (optlen != sizeof(struct sock_txtime)) {
+ if (optlen != sizeof(struct sock_txtime)) {
ret = -EINVAL;
- } else if (copy_from_user(&sk_txtime, optval,
+ break;
+ } else if (copy_from_sockptr(&sk_txtime, optval,
sizeof(struct sock_txtime))) {
ret = -EFAULT;
+ break;
} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
ret = -EINVAL;
- } else {
- sock_valbool_flag(sk, SOCK_TXTIME, true);
- sk->sk_clockid = sk_txtime.clockid;
- sk->sk_txtime_deadline_mode =
- !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
- sk->sk_txtime_report_errors =
- !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
+ break;
+ }
+ /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
+ * scheduler has enough safe guards.
+ */
+ if (sk_txtime.clockid != CLOCK_MONOTONIC &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
}
+ sock_valbool_flag(sk, SOCK_TXTIME, true);
+ sk->sk_clockid = sk_txtime.clockid;
+ sk->sk_txtime_deadline_mode =
+ !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
+ sk->sk_txtime_report_errors =
+ !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
break;
case SO_BINDTOIFINDEX:
- ret = sock_setbindtodevice_locked(sk, val);
+ ret = sock_bindtoindex_locked(sk, val);
+ break;
+
+ case SO_BUF_LOCK:
+ if (val & ~SOCK_BUF_LOCK_MASK) {
+ ret = -EINVAL;
+ break;
+ }
+ sk->sk_userlocks = val | (sk->sk_userlocks &
+ ~SOCK_BUF_LOCK_MASK);
+ break;
+
+ case SO_RESERVE_MEM:
+ {
+ int delta;
+
+ if (val < 0) {
+ ret = -EINVAL;
+ break;
+ }
+
+ delta = val - sk->sk_reserved_mem;
+ if (delta < 0)
+ sock_release_reserved_memory(sk, -delta);
+ else
+ ret = sock_reserve_memory(sk, delta);
+ break;
+ }
+
+ case SO_TXREHASH:
+ if (val < -1 || val > 1) {
+ ret = -EINVAL;
+ break;
+ }
+ /* Paired with READ_ONCE() in tcp_rtx_synack() */
+ WRITE_ONCE(sk->sk_txrehash, (u8)val);
break;
default:
ret = -ENOPROTOOPT;
break;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
return ret;
}
+
+int sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ return sk_setsockopt(sock->sk, level, optname,
+ optval, optlen);
+}
EXPORT_SYMBOL(sock_setsockopt);
+static const struct cred *sk_get_peer_cred(struct sock *sk)
+{
+ const struct cred *cred;
+
+ spin_lock(&sk->sk_peer_lock);
+ cred = get_cred(sk->sk_peer_cred);
+ spin_unlock(&sk->sk_peer_lock);
+
+ return cred;
+}
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
struct ucred *ucred)
@@ -1198,22 +1568,25 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred,
}
}
-static int groups_to_user(gid_t __user *dst, const struct group_info *src)
+static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
struct user_namespace *user_ns = current_user_ns();
int i;
- for (i = 0; i < src->ngroups; i++)
- if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
+ for (i = 0; i < src->ngroups; i++) {
+ gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
+
+ if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
return -EFAULT;
+ }
return 0;
}
-int sock_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+int sk_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
- struct sock *sk = sock->sk;
+ struct socket *sock = sk->sk_socket;
union {
int val;
@@ -1224,12 +1597,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
struct __kernel_old_timeval tm;
struct __kernel_sock_timeval stm;
struct sock_txtime txtime;
+ struct so_timestamping timestamping;
} v;
int lv = sizeof(int);
int len;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
if (len < 0)
return -EINVAL;
@@ -1306,7 +1680,6 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("getsockopt");
break;
case SO_TIMESTAMP_OLD:
@@ -1328,7 +1701,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_TIMESTAMPING_OLD:
- v.val = sk->sk_tsflags;
+ lv = sizeof(v.timestamping);
+ v.timestamping.flags = sk->sk_tsflags;
+ v.timestamping.bind_phc = sk->sk_bind_phc;
break;
case SO_RCVTIMEO_OLD:
@@ -1358,28 +1733,35 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
struct ucred peercred;
if (len > sizeof(peercred))
len = sizeof(peercred);
+
+ spin_lock(&sk->sk_peer_lock);
cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
- if (copy_to_user(optval, &peercred, len))
+ spin_unlock(&sk->sk_peer_lock);
+
+ if (copy_to_sockptr(optval, &peercred, len))
return -EFAULT;
goto lenout;
}
case SO_PEERGROUPS:
{
+ const struct cred *cred;
int ret, n;
- if (!sk->sk_peer_cred)
+ cred = sk_get_peer_cred(sk);
+ if (!cred)
return -ENODATA;
- n = sk->sk_peer_cred->group_info->ngroups;
+ n = cred->group_info->ngroups;
if (len < n * sizeof(gid_t)) {
len = n * sizeof(gid_t);
- return put_user(len, optlen) ? -EFAULT : -ERANGE;
+ put_cred(cred);
+ return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
}
len = n * sizeof(gid_t);
- ret = groups_to_user((gid_t __user *)optval,
- sk->sk_peer_cred->group_info);
+ ret = groups_to_user(optval, cred->group_info);
+ put_cred(cred);
if (ret)
return ret;
goto lenout;
@@ -1394,7 +1776,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
return -ENOTCONN;
if (lv < len)
return -EINVAL;
- if (copy_to_user(optval, address, len))
+ if (copy_to_sockptr(optval, address, len))
return -EFAULT;
goto lenout;
}
@@ -1411,12 +1793,16 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_PEERSEC:
- return security_socket_getpeersec_stream(sock, optval, optlen, len);
+ return security_socket_getpeersec_stream(sock, optval.user, optlen.user, len);
case SO_MARK:
v.val = sk->sk_mark;
break;
+ case SO_RCVMARK:
+ v.val = sock_flag(sk, SOCK_RCVMARK);
+ break;
+
case SO_RXQ_OVFL:
v.val = sock_flag(sk, SOCK_RXQ_OVFL);
break;
@@ -1439,7 +1825,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
return sock_getbindtodevice(sk, optval, optlen, len);
case SO_GET_FILTER:
- len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
+ len = sk_get_filter(sk, optval, len);
if (len < 0)
return len;
@@ -1461,6 +1847,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_BUSY_POLL:
v.val = sk->sk_ll_usec;
break;
+ case SO_PREFER_BUSY_POLL:
+ v.val = READ_ONCE(sk->sk_prefer_busy_poll);
+ break;
#endif
case SO_MAX_PACING_RATE:
@@ -1484,7 +1873,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
sk_get_meminfo(sk, meminfo);
len = min_t(unsigned int, len, sizeof(meminfo));
- if (copy_to_user(optval, &meminfo, len))
+ if (copy_to_sockptr(optval, &meminfo, len))
return -EFAULT;
goto lenout;
@@ -1522,7 +1911,26 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_BINDTOIFINDEX:
- v.val = sk->sk_bound_dev_if;
+ v.val = READ_ONCE(sk->sk_bound_dev_if);
+ break;
+
+ case SO_NETNS_COOKIE:
+ lv = sizeof(u64);
+ if (len != lv)
+ return -EINVAL;
+ v.val64 = sock_net(sk)->net_cookie;
+ break;
+
+ case SO_BUF_LOCK:
+ v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
+ break;
+
+ case SO_RESERVE_MEM:
+ v.val = sk->sk_reserved_mem;
+ break;
+
+ case SO_TXREHASH:
+ v.val = sk->sk_txrehash;
break;
default:
@@ -1534,14 +1942,22 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
if (len > lv)
len = lv;
- if (copy_to_user(optval, &v, len))
+ if (copy_to_sockptr(optval, &v, len))
return -EFAULT;
lenout:
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
return 0;
}
+int sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ return sk_getsockopt(sock->sk, level, optname,
+ USER_SOCKPTR(optval),
+ USER_SOCKPTR(optlen));
+}
+
/*
* Initialize an sk_lock.
*
@@ -1572,13 +1988,24 @@ static inline void sock_lock_init(struct sock *sk)
*/
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
+ const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
void *sptr = nsk->sk_security;
#endif
+
+ /* If we move sk_tx_queue_mapping out of the private section,
+ * we must check if sk_tx_queue_clear() is called after
+ * sock_copy() in sk_clone_lock().
+ */
+ BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
+ offsetof(struct sock, sk_dontcopy_begin) ||
+ offsetof(struct sock, sk_tx_queue_mapping) >=
+ offsetof(struct sock, sk_dontcopy_end));
+
memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
- osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+ prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
#ifdef CONFIG_SECURITY_NETWORK
nsk->sk_security = sptr;
@@ -1608,7 +2035,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
if (!try_module_get(prot->owner))
goto out_free_sec;
- sk_tx_queue_clear(sk);
}
return sk;
@@ -1666,7 +2092,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
sock_lock_init(sk);
sk->sk_net_refcnt = kern ? 0 : 1;
if (likely(sk->sk_net_refcnt)) {
- get_net(net);
+ get_net_track(net, &sk->ns_tracker, priority);
sock_inuse_add(net, 1);
}
@@ -1677,6 +2103,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
cgroup_sk_alloc(&sk->sk_cgrp_data);
sock_update_classid(&sk->sk_cgrp_data);
sock_update_netprioidx(&sk->sk_cgrp_data);
+ sk_tx_queue_clear(sk);
}
return sk;
@@ -1716,11 +2143,12 @@ static void __sk_destruct(struct rcu_head *head)
sk->sk_frag.page = NULL;
}
- if (sk->sk_peer_cred)
- put_cred(sk->sk_peer_cred);
+ /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
+ put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
+
if (likely(sk->sk_net_refcnt))
- put_net(sock_net(sk));
+ put_net_track(sock_net(sk), &sk->ns_tracker);
sk_prot_free(sk->sk_prot_creator, sk);
}
@@ -1792,116 +2220,122 @@ static void sk_init_common(struct sock *sk)
*/
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
- struct sock *newsk;
+ struct proto *prot = READ_ONCE(sk->sk_prot);
+ struct sk_filter *filter;
bool is_charged = true;
+ struct sock *newsk;
- newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
- if (newsk != NULL) {
- struct sk_filter *filter;
-
- sock_copy(newsk, sk);
-
- newsk->sk_prot_creator = sk->sk_prot;
+ newsk = sk_prot_alloc(prot, priority, sk->sk_family);
+ if (!newsk)
+ goto out;
- /* SANITY */
- if (likely(newsk->sk_net_refcnt))
- get_net(sock_net(newsk));
- sk_node_init(&newsk->sk_node);
- sock_lock_init(newsk);
- bh_lock_sock(newsk);
- newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
- newsk->sk_backlog.len = 0;
+ sock_copy(newsk, sk);
- atomic_set(&newsk->sk_rmem_alloc, 0);
- /*
- * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
- */
- refcount_set(&newsk->sk_wmem_alloc, 1);
- atomic_set(&newsk->sk_omem_alloc, 0);
- sk_init_common(newsk);
+ newsk->sk_prot_creator = prot;
- newsk->sk_dst_cache = NULL;
- newsk->sk_dst_pending_confirm = 0;
- newsk->sk_wmem_queued = 0;
- newsk->sk_forward_alloc = 0;
- atomic_set(&newsk->sk_drops, 0);
- newsk->sk_send_head = NULL;
- newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
- atomic_set(&newsk->sk_zckey, 0);
+ /* SANITY */
+ if (likely(newsk->sk_net_refcnt)) {
+ get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
+ sock_inuse_add(sock_net(newsk), 1);
+ }
+ sk_node_init(&newsk->sk_node);
+ sock_lock_init(newsk);
+ bh_lock_sock(newsk);
+ newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+ newsk->sk_backlog.len = 0;
- sock_reset_flag(newsk, SOCK_DONE);
+ atomic_set(&newsk->sk_rmem_alloc, 0);
- /* sk->sk_memcg will be populated at accept() time */
- newsk->sk_memcg = NULL;
+ /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
+ refcount_set(&newsk->sk_wmem_alloc, 1);
- cgroup_sk_alloc(&newsk->sk_cgrp_data);
+ atomic_set(&newsk->sk_omem_alloc, 0);
+ sk_init_common(newsk);
- rcu_read_lock();
- filter = rcu_dereference(sk->sk_filter);
- if (filter != NULL)
- /* though it's an empty new sock, the charging may fail
- * if sysctl_optmem_max was changed between creation of
- * original socket and cloning
- */
- is_charged = sk_filter_charge(newsk, filter);
- RCU_INIT_POINTER(newsk->sk_filter, filter);
- rcu_read_unlock();
+ newsk->sk_dst_cache = NULL;
+ newsk->sk_dst_pending_confirm = 0;
+ newsk->sk_wmem_queued = 0;
+ newsk->sk_forward_alloc = 0;
+ newsk->sk_reserved_mem = 0;
+ atomic_set(&newsk->sk_drops, 0);
+ newsk->sk_send_head = NULL;
+ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+ atomic_set(&newsk->sk_zckey, 0);
- if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
- /* We need to make sure that we don't uncharge the new
- * socket if we couldn't charge it in the first place
- * as otherwise we uncharge the parent's filter.
- */
- if (!is_charged)
- RCU_INIT_POINTER(newsk->sk_filter, NULL);
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
- }
- RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
+ sock_reset_flag(newsk, SOCK_DONE);
- if (bpf_sk_storage_clone(sk, newsk)) {
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
- }
+ /* sk->sk_memcg will be populated at accept() time */
+ newsk->sk_memcg = NULL;
- newsk->sk_err = 0;
- newsk->sk_err_soft = 0;
- newsk->sk_priority = 0;
- newsk->sk_incoming_cpu = raw_smp_processor_id();
- if (likely(newsk->sk_net_refcnt))
- sock_inuse_add(sock_net(newsk), 1);
+ cgroup_sk_clone(&newsk->sk_cgrp_data);
- /*
- * Before updating sk_refcnt, we must commit prior changes to memory
- * (Documentation/RCU/rculist_nulls.txt for details)
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter != NULL)
+ /* though it's an empty new sock, the charging may fail
+ * if sysctl_optmem_max was changed between creation of
+ * original socket and cloning
*/
- smp_wmb();
- refcount_set(&newsk->sk_refcnt, 2);
+ is_charged = sk_filter_charge(newsk, filter);
+ RCU_INIT_POINTER(newsk->sk_filter, filter);
+ rcu_read_unlock();
- /*
- * Increment the counter in the same struct proto as the master
- * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
- * is the same as sk->sk_prot->socks, as this field was copied
- * with memcpy).
- *
- * This _changes_ the previous behaviour, where
- * tcp_create_openreq_child always was incrementing the
- * equivalent to tcp_prot->socks (inet_sock_nr), so this have
- * to be taken into account in all callers. -acme
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
+ /* We need to make sure that we don't uncharge the new
+ * socket if we couldn't charge it in the first place
+ * as otherwise we uncharge the parent's filter.
*/
- sk_refcnt_debug_inc(newsk);
- sk_set_socket(newsk, NULL);
- RCU_INIT_POINTER(newsk->sk_wq, NULL);
-
- if (newsk->sk_prot->sockets_allocated)
- sk_sockets_allocated_inc(newsk);
+ if (!is_charged)
+ RCU_INIT_POINTER(newsk->sk_filter, NULL);
+ sk_free_unlock_clone(newsk);
+ newsk = NULL;
+ goto out;
+ }
+ RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
- if (sock_needs_netstamp(sk) &&
- newsk->sk_flags & SK_FLAGS_TIMESTAMP)
- net_enable_timestamp();
+ if (bpf_sk_storage_clone(sk, newsk)) {
+ sk_free_unlock_clone(newsk);
+ newsk = NULL;
+ goto out;
}
+
+ /* Clear sk_user_data if parent had the pointer tagged
+ * as not suitable for copying when cloning.
+ */
+ if (sk_user_data_is_nocopy(newsk))
+ newsk->sk_user_data = NULL;
+
+ newsk->sk_err = 0;
+ newsk->sk_err_soft = 0;
+ newsk->sk_priority = 0;
+ newsk->sk_incoming_cpu = raw_smp_processor_id();
+
+ /* Before updating sk_refcnt, we must commit prior changes to memory
+ * (Documentation/RCU/rculist_nulls.rst for details)
+ */
+ smp_wmb();
+ refcount_set(&newsk->sk_refcnt, 2);
+
+ /* Increment the counter in the same struct proto as the master
+ * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
+ * is the same as sk->sk_prot->socks, as this field was copied
+ * with memcpy).
+ *
+ * This _changes_ the previous behaviour, where
+ * tcp_create_openreq_child always was incrementing the
+ * equivalent to tcp_prot->socks (inet_sock_nr), so this have
+ * to be taken into account in all callers. -acme
+ */
+ sk_refcnt_debug_inc(newsk);
+ sk_set_socket(newsk, NULL);
+ sk_tx_queue_clear(newsk);
+ RCU_INIT_POINTER(newsk->sk_wq, NULL);
+
+ if (newsk->sk_prot->sockets_allocated)
+ sk_sockets_allocated_inc(newsk);
+
+ if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ net_enable_timestamp();
out:
return newsk;
}
@@ -1917,22 +2351,42 @@ void sk_free_unlock_clone(struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
+static void sk_trim_gso_size(struct sock *sk)
+{
+ if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
+ return;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6 &&
+ sk_is_tcp(sk) &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ return;
+#endif
+ sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
+}
+
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
u32 max_segs = 1;
sk_dst_set(sk, dst);
- sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
+ sk->sk_route_caps = dst->dev->features;
+ if (sk_is_tcp(sk))
+ sk->sk_route_caps |= NETIF_F_GSO;
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
- sk->sk_route_caps &= ~sk->sk_route_nocaps;
+ if (unlikely(sk->sk_gso_disabled))
+ sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
if (sk_can_gso(sk)) {
if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
- sk->sk_gso_max_size = dst->dev->gso_max_size;
- max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
+ /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
+ sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
+ sk_trim_gso_size(sk);
+ sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
+ /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
+ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
}
}
sk->sk_gso_max_segs = max_segs;
@@ -1951,8 +2405,20 @@ void sock_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
unsigned int len = skb->truesize;
+ bool free;
if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
+ if (sock_flag(sk, SOCK_RCU_FREE) &&
+ sk->sk_write_space == sock_def_write_space) {
+ rcu_read_lock();
+ free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
+ sock_def_write_space_wfree(sk);
+ rcu_read_unlock();
+ if (unlikely(free))
+ __sk_free(sk);
+ return;
+ }
+
/*
* Keep a reference on sk_wmem_alloc, this will be released
* after sk_write_space() call
@@ -2027,16 +2493,10 @@ void skb_orphan_partial(struct sk_buff *skb)
if (skb_is_tcp_pure_ack(skb))
return;
- if (can_skb_orphan_partial(skb)) {
- struct sock *sk = skb->sk;
+ if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
+ return;
- if (refcount_inc_not_zero(&sk->sk_refcnt)) {
- WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
- skb->destructor = sock_efree;
- }
- } else {
- skb_orphan(skb);
- }
+ skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);
@@ -2063,6 +2523,18 @@ void sock_efree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_efree);
+/* Buffer destructor for prefetch/receive path where reference count may
+ * not be held, e.g. for listen sockets.
+ */
+#ifdef CONFIG_INET
+void sock_pfree(struct sk_buff *skb)
+{
+ if (sk_is_refcounted(skb->sk))
+ sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_pfree);
+#endif /* CONFIG_INET */
+
kuid_t sock_i_uid(struct sock *sk)
{
kuid_t uid;
@@ -2118,7 +2590,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
- sysctl_optmem_max)
+ READ_ONCE(sysctl_optmem_max))
return NULL;
skb = alloc_skb(size, priority);
@@ -2136,8 +2608,10 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
*/
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
- if ((unsigned int)size <= sysctl_optmem_max &&
- atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
+ int optmem_max = READ_ONCE(sysctl_optmem_max);
+
+ if ((unsigned int)size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
void *mem;
/* First do the add, to avoid the race if kmalloc
* might sleep.
@@ -2162,7 +2636,7 @@ static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
if (WARN_ON_ONCE(!mem))
return;
if (nullify)
- kzfree(mem);
+ kfree_sensitive(mem);
else
kfree(mem);
atomic_sub(size, &sk->sk_omem_alloc);
@@ -2256,13 +2730,6 @@ failure:
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
-struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
- int noblock, int *errcode)
-{
- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
-}
-EXPORT_SYMBOL(sock_alloc_send_skb);
-
int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
struct sockcm_cookie *sockc)
{
@@ -2270,7 +2737,8 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
switch (cmsg->cmsg_type) {
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
return -EINVAL;
@@ -2344,8 +2812,6 @@ static void sk_leave_memory_pressure(struct sock *sk)
}
}
-/* On 32bit arches, an skb frag is limited to 2^15 */
-#define SKB_FRAG_PAGE_ORDER get_order(32768)
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
/**
@@ -2403,7 +2869,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
}
EXPORT_SYMBOL(sk_page_frag_refill);
-static void __lock_sock(struct sock *sk)
+void __lock_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
{
@@ -2435,7 +2901,7 @@ void __release_sock(struct sock *sk)
do {
next = skb->next;
prefetch(next);
- WARN_ON_ONCE(skb_dst_is_noref(skb));
+ DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
skb_mark_not_on_list(skb);
sk_backlog_rcv(sk, skb);
@@ -2460,6 +2926,7 @@ void __sk_flush_backlog(struct sock *sk)
__release_sock(sk);
spin_unlock_bh(&sk->sk_lock.slock);
}
+EXPORT_SYMBOL_GPL(__sk_flush_backlog);
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
@@ -2497,12 +2964,16 @@ EXPORT_SYMBOL(sk_wait_data);
*/
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
+ bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
struct proto *prot = sk->sk_prot;
- long allocated = sk_memory_allocated_add(sk, amt);
bool charged = true;
+ long allocated;
- if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
- !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
+ sk_memory_allocated_add(sk, amt);
+ allocated = sk_memory_allocated(sk);
+ if (memcg_charge &&
+ !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+ gfp_memcg_charge())))
goto suppress_allocation;
/* Under limit. */
@@ -2556,8 +3027,14 @@ suppress_allocation:
/* Fail only if socket is _under_ its sndbuf.
* In this case we cannot block, so that we have to fail.
*/
- if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
+ /* Force charge with __GFP_NOFAIL */
+ if (memcg_charge && !charged) {
+ mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+ gfp_memcg_charge() | __GFP_NOFAIL);
+ }
return 1;
+ }
}
if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
@@ -2565,12 +3042,11 @@ suppress_allocation:
sk_memory_allocated_sub(sk, amt);
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ if (memcg_charge && charged)
mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
return 0;
}
-EXPORT_SYMBOL(__sk_mem_raise_allocated);
/**
* __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
@@ -2586,10 +3062,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
int ret, amt = sk_mem_pages(size);
- sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
+ sk->sk_forward_alloc += amt << PAGE_SHIFT;
ret = __sk_mem_raise_allocated(sk, size, amt, kind);
if (!ret)
- sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
+ sk->sk_forward_alloc -= amt << PAGE_SHIFT;
return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);
@@ -2612,17 +3088,16 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount)
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
}
-EXPORT_SYMBOL(__sk_mem_reduce_allocated);
/**
* __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
* @sk: socket
- * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
*/
void __sk_mem_reclaim(struct sock *sk, int amount)
{
- amount >>= SK_MEM_QUANTUM_SHIFT;
- sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ amount >>= PAGE_SHIFT;
+ sk->sk_forward_alloc -= amount << PAGE_SHIFT;
__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
@@ -2692,20 +3167,6 @@ int sock_no_shutdown(struct socket *sock, int how)
}
EXPORT_SYMBOL(sock_no_shutdown);
-int sock_no_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- return -EOPNOTSUPP;
-}
-EXPORT_SYMBOL(sock_no_setsockopt);
-
-int sock_no_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- return -EOPNOTSUPP;
-}
-EXPORT_SYMBOL(sock_no_getsockopt);
-
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
return -EOPNOTSUPP;
@@ -2732,6 +3193,21 @@ int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *
}
EXPORT_SYMBOL(sock_no_mmap);
+/*
+ * When a file is received (via SCM_RIGHTS, etc), we must bump the
+ * various sock-based usage counts.
+ */
+void __receive_sock(struct file *file)
+{
+ struct socket *sock;
+
+ sock = sock_from_file(file);
+ if (sock) {
+ sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+ sock_update_classid(&sock->sk->sk_cgrp_data);
+ }
+}
+
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
ssize_t res;
@@ -2811,20 +3287,42 @@ static void sock_def_write_space(struct sock *sk)
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
+ if (sock_writeable(sk)) {
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
EPOLLWRNORM | EPOLLWRBAND);
/* Should agree with poll, otherwise some programs break */
- if (sock_writeable(sk))
- sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
rcu_read_unlock();
}
+/* An optimised version of sock_def_write_space(), should only be called
+ * for SOCK_RCU_FREE sockets under RCU read section and after putting
+ * ->sk_wmem_alloc.
+ */
+static void sock_def_write_space_wfree(struct sock *sk)
+{
+ /* Do not wake up a writer until he can make "significant"
+ * progress. --DaveM
+ */
+ if (sock_writeable(sk)) {
+ struct socket_wq *wq = rcu_dereference(sk->sk_wq);
+
+ /* rely on refcount_sub from sock_wfree() */
+ smp_mb__after_atomic();
+ if (wq && waitqueue_active(&wq->wait))
+ wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
+ EPOLLWRNORM | EPOLLWRBAND);
+
+ /* Should agree with poll, otherwise some programs break */
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ }
+}
+
static void sock_def_destruct(struct sock *sk)
{
}
@@ -2852,6 +3350,13 @@ void sk_stop_timer(struct sock *sk, struct timer_list* timer)
}
EXPORT_SYMBOL(sk_stop_timer);
+void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
+{
+ if (del_timer_sync(timer))
+ __sock_put(sk);
+}
+EXPORT_SYMBOL(sk_stop_timer_sync);
+
void sock_init_data(struct socket *sock, struct sock *sk)
{
sk_init_common(sk);
@@ -2860,8 +3365,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
timer_setup(&sk->sk_timer, NULL, 0);
sk->sk_allocation = GFP_KERNEL;
- sk->sk_rcvbuf = sysctl_rmem_default;
- sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
+ sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
sk->sk_state = TCP_CLOSE;
sk_set_socket(sk, sock);
@@ -2901,6 +3406,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_peer_pid = NULL;
sk->sk_peer_cred = NULL;
+ spin_lock_init(&sk->sk_peer_lock);
+
sk->sk_write_pending = 0;
sk->sk_rcvlowat = 1;
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
@@ -2914,18 +3421,19 @@ void sock_init_data(struct socket *sock, struct sock *sk)
#ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = 0;
- sk->sk_ll_usec = sysctl_net_busy_read;
+ sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
#endif
sk->sk_max_pacing_rate = ~0UL;
sk->sk_pacing_rate = ~0UL;
WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1;
+ sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
sk_rx_queue_clear(sk);
/*
* Before updating sk_refcnt, we must commit prior changes to memory
- * (Documentation/RCU/rculist_nulls.txt for details)
+ * (Documentation/RCU/rculist_nulls.rst for details)
*/
smp_wmb();
refcount_set(&sk->sk_refcnt, 1);
@@ -2935,17 +3443,15 @@ EXPORT_SYMBOL(sock_init_data);
void lock_sock_nested(struct sock *sk, int subclass)
{
+ /* The sk_lock has mutex_lock() semantics here. */
+ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+
might_sleep();
spin_lock_bh(&sk->sk_lock.slock);
- if (sk->sk_lock.owned)
+ if (sock_owned_by_user_nocheck(sk))
__lock_sock(sk);
sk->sk_lock.owned = 1;
- spin_unlock(&sk->sk_lock.slock);
- /*
- * The sk_lock has mutex_lock() semantics here:
- */
- mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
- local_bh_enable();
+ spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(lock_sock_nested);
@@ -2968,41 +3474,37 @@ void release_sock(struct sock *sk)
}
EXPORT_SYMBOL(release_sock);
-/**
- * lock_sock_fast - fast version of lock_sock
- * @sk: socket
- *
- * This version should be used for very small section, where process wont block
- * return false if fast path is taken:
- *
- * sk_lock.slock locked, owned = 0, BH disabled
- *
- * return true if slow path is taken:
- *
- * sk_lock.slock unlocked, owned = 1, BH enabled
- */
-bool lock_sock_fast(struct sock *sk)
+bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
might_sleep();
spin_lock_bh(&sk->sk_lock.slock);
- if (!sk->sk_lock.owned)
+ if (!sock_owned_by_user_nocheck(sk)) {
/*
- * Note : We must disable BH
+ * Fast path return with bottom halves disabled and
+ * sock::sk_lock.slock held.
+ *
+ * The 'mutex' is not contended and holding
+ * sock::sk_lock.slock prevents all other lockers to
+ * proceed so the corresponding unlock_sock_fast() can
+ * avoid the slow path of release_sock() completely and
+ * just release slock.
+ *
+ * From a semantical POV this is equivalent to 'acquiring'
+ * the 'mutex', hence the corresponding lockdep
+ * mutex_release() has to happen in the fast path of
+ * unlock_sock_fast().
*/
return false;
+ }
__lock_sock(sk);
sk->sk_lock.owned = 1;
- spin_unlock(&sk->sk_lock.slock);
- /*
- * The sk_lock has mutex_lock() semantics here:
- */
- mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
- local_bh_enable();
+ __acquire(&sk->sk_lock.slock);
+ spin_unlock_bh(&sk->sk_lock.slock);
return true;
}
-EXPORT_SYMBOL(lock_sock_fast);
+EXPORT_SYMBOL(__lock_sock_fast);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
bool timeval, bool time32)
@@ -3108,24 +3610,11 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname,
{
struct sock *sk = sock->sk;
- return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
-#ifdef CONFIG_COMPAT
-int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- struct sock *sk = sock->sk;
-
- if (sk->sk_prot->compat_getsockopt != NULL)
- return sk->sk_prot->compat_getsockopt(sk, level, optname,
- optval, optlen);
- return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
-}
-EXPORT_SYMBOL(compat_sock_common_getsockopt);
-#endif
-
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags)
{
@@ -3133,8 +3622,7 @@ int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int addr_len = 0;
int err;
- err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
- flags & ~MSG_DONTWAIT, &addr_len);
+ err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
return err;
@@ -3145,35 +3633,22 @@ EXPORT_SYMBOL(sock_common_recvmsg);
* Set socket options on an inet socket.
*/
int sock_common_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
- return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);
-#ifdef CONFIG_COMPAT
-int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- struct sock *sk = sock->sk;
-
- if (sk->sk_prot->compat_setsockopt != NULL)
- return sk->sk_prot->compat_setsockopt(sk, level, optname,
- optval, optlen);
- return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
-}
-EXPORT_SYMBOL(compat_sock_common_setsockopt);
-#endif
-
void sk_common_release(struct sock *sk)
{
if (sk->sk_prot->destroy)
sk->sk_prot->destroy(sk);
/*
- * Observation: when sock_common_release is called, processes have
+ * Observation: when sk_common_release is called, processes have
* no access to socket. But net still has.
* Step one, detach it from networking:
*
@@ -3220,19 +3695,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
}
#ifdef CONFIG_PROC_FS
-#define PROTO_INUSE_NR 64 /* should be enough for the first time */
-struct prot_inuse {
- int val[PROTO_INUSE_NR];
-};
-
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
-void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
-{
- __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
-}
-EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
-
int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
int cpu, idx = prot->inuse_idx;
@@ -3245,17 +3709,12 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
-static void sock_inuse_add(struct net *net, int val)
-{
- this_cpu_add(*net->core.sock_inuse, val);
-}
-
int sock_inuse_get(struct net *net)
{
int cpu, res = 0;
for_each_possible_cpu(cpu)
- res += *per_cpu_ptr(net->core.sock_inuse, cpu);
+ res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
return res;
}
@@ -3267,22 +3726,12 @@ static int __net_init sock_inuse_init_net(struct net *net)
net->core.prot_inuse = alloc_percpu(struct prot_inuse);
if (net->core.prot_inuse == NULL)
return -ENOMEM;
-
- net->core.sock_inuse = alloc_percpu(int);
- if (net->core.sock_inuse == NULL)
- goto out;
-
return 0;
-
-out:
- free_percpu(net->core.prot_inuse);
- return -ENOMEM;
}
static void __net_exit sock_inuse_exit_net(struct net *net)
{
free_percpu(net->core.prot_inuse);
- free_percpu(net->core.sock_inuse);
}
static struct pernet_operations net_inuse_ops = {
@@ -3328,10 +3777,43 @@ static inline void release_proto_idx(struct proto *prot)
{
}
-static void sock_inuse_add(struct net *net, int val)
+#endif
+
+static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
+ if (!twsk_prot)
+ return;
+ kfree(twsk_prot->twsk_slab_name);
+ twsk_prot->twsk_slab_name = NULL;
+ kmem_cache_destroy(twsk_prot->twsk_slab);
+ twsk_prot->twsk_slab = NULL;
+}
+
+static int tw_prot_init(const struct proto *prot)
+{
+ struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
+
+ if (!twsk_prot)
+ return 0;
+
+ twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
+ prot->name);
+ if (!twsk_prot->twsk_slab_name)
+ return -ENOMEM;
+
+ twsk_prot->twsk_slab =
+ kmem_cache_create(twsk_prot->twsk_slab_name,
+ twsk_prot->twsk_obj_size, 0,
+ SLAB_ACCOUNT | prot->slab_flags,
+ NULL);
+ if (!twsk_prot->twsk_slab) {
+ pr_crit("%s: Can't create timewait sock SLAB cache!\n",
+ prot->name);
+ return -ENOMEM;
+ }
+
+ return 0;
}
-#endif
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
@@ -3372,6 +3854,14 @@ int proto_register(struct proto *prot, int alloc_slab)
{
int ret = -ENOBUFS;
+ if (prot->memory_allocated && !prot->sysctl_mem) {
+ pr_err("%s: missing sysctl_mem\n", prot->name);
+ return -EINVAL;
+ }
+ if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
+ pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
+ return -EINVAL;
+ }
if (alloc_slab) {
prot->slab = kmem_cache_create_usercopy(prot->name,
prot->obj_size, 0,
@@ -3389,37 +3879,23 @@ int proto_register(struct proto *prot, int alloc_slab)
if (req_prot_init(prot))
goto out_free_request_sock_slab;
- if (prot->twsk_prot != NULL) {
- prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
-
- if (prot->twsk_prot->twsk_slab_name == NULL)
- goto out_free_request_sock_slab;
-
- prot->twsk_prot->twsk_slab =
- kmem_cache_create(prot->twsk_prot->twsk_slab_name,
- prot->twsk_prot->twsk_obj_size,
- 0,
- SLAB_ACCOUNT |
- prot->slab_flags,
- NULL);
- if (prot->twsk_prot->twsk_slab == NULL)
- goto out_free_timewait_sock_slab_name;
- }
+ if (tw_prot_init(prot))
+ goto out_free_timewait_sock_slab;
}
mutex_lock(&proto_list_mutex);
ret = assign_proto_idx(prot);
if (ret) {
mutex_unlock(&proto_list_mutex);
- goto out_free_timewait_sock_slab_name;
+ goto out_free_timewait_sock_slab;
}
list_add(&prot->node, &proto_list);
mutex_unlock(&proto_list_mutex);
return ret;
-out_free_timewait_sock_slab_name:
- if (alloc_slab && prot->twsk_prot)
- kfree(prot->twsk_prot->twsk_slab_name);
+out_free_timewait_sock_slab:
+ if (alloc_slab)
+ tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
if (alloc_slab) {
req_prot_cleanup(prot->rsk_prot);
@@ -3443,12 +3919,7 @@ void proto_unregister(struct proto *prot)
prot->slab = NULL;
req_prot_cleanup(prot->rsk_prot);
-
- if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
- kmem_cache_destroy(prot->twsk_prot->twsk_slab);
- kfree(prot->twsk_prot->twsk_slab_name);
- prot->twsk_prot->twsk_slab = NULL;
- }
+ tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
@@ -3465,6 +3936,7 @@ int sock_load_diag_module(int family, int protocol)
#ifdef CONFIG_INET
if (family == AF_INET &&
protocol != IPPROTO_RAW &&
+ protocol < MAX_INET_PROTOS &&
!rcu_access_pointer(inet_protos[protocol]))
return -ENOENT;
#endif
@@ -3606,3 +4078,11 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
+{
+ if (!sk->sk_prot->bind_add)
+ return -EOPNOTSUPP;
+ return sk->sk_prot->bind_add(sk, addr, addr_len);
+}
+EXPORT_SYMBOL(sock_bind_add);
diff --git a/net/core/sock_destructor.h b/net/core/sock_destructor.h
new file mode 100644
index 000000000000..2f396e6bfba5
--- /dev/null
+++ b/net/core/sock_destructor.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_CORE_SOCK_DESTRUCTOR_H
+#define _NET_CORE_SOCK_DESTRUCTOR_H
+#include <net/tcp.h>
+
+static inline bool is_skb_wmem(const struct sk_buff *skb)
+{
+ return skb->destructor == sock_wfree ||
+ skb->destructor == __sock_wfree ||
+ (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);
+}
+#endif
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index c13ffbd33d8d..f7cf74cdd3db 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -1,5 +1,6 @@
/* License: GPL */
+#include <linux/filter.h>
#include <linux/mutex.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
@@ -11,7 +12,7 @@
#include <linux/tcp.h>
#include <linux/workqueue.h>
#include <linux/nospec.h>
-
+#include <linux/cookie.h>
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
@@ -19,16 +20,17 @@ static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
static DEFINE_MUTEX(sock_diag_table_mutex);
static struct workqueue_struct *broadcast_wq;
-static atomic64_t cookie_gen;
-u64 sock_gen_cookie(struct sock *sk)
+DEFINE_COOKIE(sock_cookie);
+
+u64 __sock_gen_cookie(struct sock *sk)
{
while (1) {
u64 res = atomic64_read(&sk->sk_cookie);
if (res)
return res;
- res = atomic64_inc_return(&cookie_gen);
+ res = gen_cookie_next(&sock_cookie);
atomic64_cmpxchg(&sk->sk_cookie, 0, res);
}
}
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 085cef5857bb..81beb16ab1eb 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
#include <linux/bpf.h>
+#include <linux/btf_ids.h>
#include <linux/filter.h>
#include <linux/errno.h>
#include <linux/file.h>
@@ -10,6 +11,8 @@
#include <linux/skmsg.h>
#include <linux/list.h>
#include <linux/jhash.h>
+#include <linux/sock_diag.h>
+#include <net/udp.h>
struct bpf_stab {
struct bpf_map map;
@@ -21,43 +24,39 @@ struct bpf_stab {
#define SOCK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, u32 which);
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);
+
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
struct bpf_stab *stab;
- u64 cost;
- int err;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
if (attr->max_entries == 0 ||
attr->key_size != 4 ||
- attr->value_size != 4 ||
+ (attr->value_size != sizeof(u32) &&
+ attr->value_size != sizeof(u64)) ||
attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
- stab = kzalloc(sizeof(*stab), GFP_USER);
+ stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE);
if (!stab)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&stab->map, attr);
raw_spin_lock_init(&stab->lock);
- /* Make sure page count doesn't overflow. */
- cost = (u64) stab->map.max_entries * sizeof(struct sock *);
- err = bpf_map_charge_init(&stab->map.memory, cost);
- if (err)
- goto free_stab;
-
- stab->sks = bpf_map_area_alloc(stab->map.max_entries *
+ stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries *
sizeof(struct sock *),
stab->map.numa_node);
- if (stab->sks)
- return &stab->map;
- err = -ENOMEM;
- bpf_map_charge_finish(&stab->map.memory);
-free_stab:
- kfree(stab);
- return ERR_PTR(err);
+ if (!stab->sks) {
+ bpf_map_area_free(stab);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &stab->map;
}
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
@@ -67,11 +66,49 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
struct fd f;
int ret;
+ if (attr->attach_flags || attr->replace_bpf_fd)
+ return -EINVAL;
+
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
- ret = sock_map_prog_update(map, prog, attr->attach_type);
+ ret = sock_map_prog_update(map, prog, NULL, attr->attach_type);
+ fdput(f);
+ return ret;
+}
+
+int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
+{
+ u32 ufd = attr->target_fd;
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ struct fd f;
+ int ret;
+
+ if (attr->attach_flags || attr->replace_bpf_fd)
+ return -EINVAL;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ prog = bpf_prog_get(attr->attach_bpf_fd);
+ if (IS_ERR(prog)) {
+ ret = PTR_ERR(prog);
+ goto put_map;
+ }
+
+ if (prog->type != ptype) {
+ ret = -EINVAL;
+ goto put_prog;
+ }
+
+ ret = sock_map_prog_update(map, NULL, prog, attr->attach_type);
+put_prog:
+ bpf_prog_put(prog);
+put_map:
fdput(f);
return ret;
}
@@ -106,8 +143,8 @@ static void sock_map_add_link(struct sk_psock *psock,
static void sock_map_del_link(struct sock *sk,
struct sk_psock *psock, void *link_raw)
{
+ bool strp_stop = false, verdict_stop = false;
struct sk_psock_link *link, *tmp;
- bool strp_stop = false;
spin_lock_bh(&psock->link_lock);
list_for_each_entry_safe(link, tmp, &psock->link, list) {
@@ -115,16 +152,26 @@ static void sock_map_del_link(struct sock *sk,
struct bpf_map *map = link->map;
struct bpf_stab *stab = container_of(map, struct bpf_stab,
map);
- if (psock->parser.enabled && stab->progs.skb_parser)
+ if (psock->saved_data_ready && stab->progs.stream_parser)
strp_stop = true;
+ if (psock->saved_data_ready && stab->progs.stream_verdict)
+ verdict_stop = true;
+ if (psock->saved_data_ready && stab->progs.skb_verdict)
+ verdict_stop = true;
list_del(&link->list);
sk_psock_free_link(link);
}
}
spin_unlock_bh(&psock->link_lock);
- if (strp_stop) {
+ if (strp_stop || verdict_stop) {
write_lock_bh(&sk->sk_callback_lock);
- sk_psock_stop_strp(sk, psock);
+ if (strp_stop)
+ sk_psock_stop_strp(sk, psock);
+ if (verdict_stop)
+ sk_psock_stop_verdict(sk, psock);
+
+ if (psock->psock_update_sk_prot)
+ psock->psock_update_sk_prot(sk, psock, false);
write_unlock_bh(&sk->sk_callback_lock);
}
}
@@ -139,25 +186,57 @@ static void sock_map_unref(struct sock *sk, void *link_raw)
}
}
-static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
- struct sock *sk)
+static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
+{
+ if (!sk->sk_prot->psock_update_sk_prot)
+ return -EINVAL;
+ psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot;
+ return sk->sk_prot->psock_update_sk_prot(sk, psock, false);
+}
+
+static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
{
- struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
- bool skb_progs, sk_psock_is_new = false;
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (psock) {
+ if (sk->sk_prot->close != sock_map_close) {
+ psock = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ if (!refcount_inc_not_zero(&psock->refcnt))
+ psock = ERR_PTR(-EBUSY);
+ }
+out:
+ rcu_read_unlock();
+ return psock;
+}
+
+static int sock_map_link(struct bpf_map *map, struct sock *sk)
+{
+ struct sk_psock_progs *progs = sock_map_progs(map);
+ struct bpf_prog *stream_verdict = NULL;
+ struct bpf_prog *stream_parser = NULL;
+ struct bpf_prog *skb_verdict = NULL;
+ struct bpf_prog *msg_parser = NULL;
struct sk_psock *psock;
int ret;
- skb_verdict = READ_ONCE(progs->skb_verdict);
- skb_parser = READ_ONCE(progs->skb_parser);
- skb_progs = skb_parser && skb_verdict;
- if (skb_progs) {
- skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
- if (IS_ERR(skb_verdict))
- return PTR_ERR(skb_verdict);
- skb_parser = bpf_prog_inc_not_zero(skb_parser);
- if (IS_ERR(skb_parser)) {
- bpf_prog_put(skb_verdict);
- return PTR_ERR(skb_parser);
+ stream_verdict = READ_ONCE(progs->stream_verdict);
+ if (stream_verdict) {
+ stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
+ if (IS_ERR(stream_verdict))
+ return PTR_ERR(stream_verdict);
+ }
+
+ stream_parser = READ_ONCE(progs->stream_parser);
+ if (stream_parser) {
+ stream_parser = bpf_prog_inc_not_zero(stream_parser);
+ if (IS_ERR(stream_parser)) {
+ ret = PTR_ERR(stream_parser);
+ goto out_put_stream_verdict;
}
}
@@ -166,11 +245,20 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
msg_parser = bpf_prog_inc_not_zero(msg_parser);
if (IS_ERR(msg_parser)) {
ret = PTR_ERR(msg_parser);
- goto out;
+ goto out_put_stream_parser;
}
}
- psock = sk_psock_get_checked(sk);
+ skb_verdict = READ_ONCE(progs->skb_verdict);
+ if (skb_verdict) {
+ skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
+ if (IS_ERR(skb_verdict)) {
+ ret = PTR_ERR(skb_verdict);
+ goto out_put_msg_parser;
+ }
+ }
+
+ psock = sock_map_psock_get_checked(sk);
if (IS_ERR(psock)) {
ret = PTR_ERR(psock);
goto out_progs;
@@ -178,53 +266,70 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
if (psock) {
if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
- (skb_progs && READ_ONCE(psock->progs.skb_parser))) {
+ (stream_parser && READ_ONCE(psock->progs.stream_parser)) ||
+ (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
+ (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) ||
+ (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
+ (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) {
sk_psock_put(sk, psock);
ret = -EBUSY;
goto out_progs;
}
} else {
psock = sk_psock_init(sk, map->numa_node);
- if (!psock) {
- ret = -ENOMEM;
+ if (IS_ERR(psock)) {
+ ret = PTR_ERR(psock);
goto out_progs;
}
- sk_psock_is_new = true;
}
if (msg_parser)
psock_set_prog(&psock->progs.msg_parser, msg_parser);
- if (sk_psock_is_new) {
- ret = tcp_bpf_init(sk);
- if (ret < 0)
- goto out_drop;
- } else {
- tcp_bpf_reinit(sk);
+ if (stream_parser)
+ psock_set_prog(&psock->progs.stream_parser, stream_parser);
+ if (stream_verdict)
+ psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
+ if (skb_verdict)
+ psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+
+ /* msg_* and stream_* programs references tracked in psock after this
+ * point. Reference dec and cleanup will occur through psock destructor
+ */
+ ret = sock_map_init_proto(sk, psock);
+ if (ret < 0) {
+ sk_psock_put(sk, psock);
+ goto out;
}
write_lock_bh(&sk->sk_callback_lock);
- if (skb_progs && !psock->parser.enabled) {
+ if (stream_parser && stream_verdict && !psock->saved_data_ready) {
ret = sk_psock_init_strp(sk, psock);
if (ret) {
write_unlock_bh(&sk->sk_callback_lock);
- goto out_drop;
+ sk_psock_put(sk, psock);
+ goto out;
}
- psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
- psock_set_prog(&psock->progs.skb_parser, skb_parser);
sk_psock_start_strp(sk, psock);
+ } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
+ sk_psock_start_verdict(sk,psock);
+ } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) {
+ sk_psock_start_verdict(sk, psock);
}
write_unlock_bh(&sk->sk_callback_lock);
return 0;
-out_drop:
- sk_psock_put(sk, psock);
out_progs:
+ if (skb_verdict)
+ bpf_prog_put(skb_verdict);
+out_put_msg_parser:
if (msg_parser)
bpf_prog_put(msg_parser);
+out_put_stream_parser:
+ if (stream_parser)
+ bpf_prog_put(stream_parser);
+out_put_stream_verdict:
+ if (stream_verdict)
+ bpf_prog_put(stream_verdict);
out:
- if (skb_progs) {
- bpf_prog_put(skb_verdict);
- bpf_prog_put(skb_parser);
- }
return ret;
}
@@ -233,8 +338,11 @@ static void sock_map_free(struct bpf_map *map)
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
int i;
+ /* After the sync no updates or deletes will be in-flight so it
+ * is safe to walk map and remove entries without risking a race
+ * in EEXIST update case.
+ */
synchronize_rcu();
- raw_spin_lock_bh(&stab->lock);
for (i = 0; i < stab->map.max_entries; i++) {
struct sock **psk = &stab->sks[i];
struct sock *sk;
@@ -248,13 +356,12 @@ static void sock_map_free(struct bpf_map *map)
release_sock(sk);
}
}
- raw_spin_unlock_bh(&stab->lock);
/* wait for psock readers accessing its map link */
synchronize_rcu();
bpf_map_area_free(stab->sks);
- kfree(stab);
+ bpf_map_area_free(stab);
}
static void sock_map_release_progs(struct bpf_map *map)
@@ -275,7 +382,29 @@ static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
static void *sock_map_lookup(struct bpf_map *map, void *key)
{
- return ERR_PTR(-EOPNOTSUPP);
+ struct sock *sk;
+
+ sk = __sock_map_lookup_elem(map, *(u32 *)key);
+ if (!sk)
+ return NULL;
+ if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+ return sk;
+}
+
+static void *sock_map_lookup_sys(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ if (map->value_size != sizeof(u64))
+ return ERR_PTR(-ENOSPC);
+
+ sk = __sock_map_lookup_elem(map, *(u32 *)key);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ __sock_gen_cookie(sk);
+ return &sk->sk_cookie;
}
static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
@@ -338,7 +467,6 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
struct sock *sk, u64 flags)
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
- struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_psock_link *link;
struct sk_psock *psock;
struct sock *osk;
@@ -349,14 +477,12 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
return -EINVAL;
if (unlikely(idx >= map->max_entries))
return -E2BIG;
- if (unlikely(rcu_access_pointer(icsk->icsk_ulp_data)))
- return -EINVAL;
link = sk_psock_init_link();
if (!link)
return -ENOMEM;
- ret = sock_map_link(map, &stab->progs, sk);
+ ret = sock_map_link(map, sk);
if (ret < 0)
goto out_free;
@@ -391,23 +517,47 @@ out_free:
static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops)
{
return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
- ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
+ ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
+ ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB;
+}
+
+static bool sock_map_redirect_allowed(const struct sock *sk)
+{
+ if (sk_is_tcp(sk))
+ return sk->sk_state != TCP_LISTEN;
+ else
+ return sk->sk_state == TCP_ESTABLISHED;
}
static bool sock_map_sk_is_suitable(const struct sock *sk)
{
- return sk->sk_type == SOCK_STREAM &&
- sk->sk_protocol == IPPROTO_TCP;
+ return !!sk->sk_prot->psock_update_sk_prot;
}
-static int sock_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static bool sock_map_sk_state_allowed(const struct sock *sk)
+{
+ if (sk_is_tcp(sk))
+ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
+ return true;
+}
+
+static int sock_hash_update_common(struct bpf_map *map, void *key,
+ struct sock *sk, u64 flags);
+
+int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
+ u64 flags)
{
- u32 ufd = *(u32 *)value;
- u32 idx = *(u32 *)key;
struct socket *sock;
struct sock *sk;
int ret;
+ u64 ufd;
+
+ if (map->value_size == sizeof(u64))
+ ufd = *(u64 *)value;
+ else
+ ufd = *(u32 *)value;
+ if (ufd > S32_MAX)
+ return -EINVAL;
sock = sockfd_lookup(ufd, &ret);
if (!sock)
@@ -423,13 +573,40 @@ static int sock_map_update_elem(struct bpf_map *map, void *key,
}
sock_map_sk_acquire(sk);
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (!sock_map_sk_state_allowed(sk))
ret = -EOPNOTSUPP;
+ else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
else
- ret = sock_map_update_common(map, idx, sk, flags);
+ ret = sock_hash_update_common(map, key, sk, flags);
sock_map_sk_release(sk);
out:
- fput(sock->file);
+ sockfd_put(sock);
+ return ret;
+}
+
+static int sock_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ struct sock *sk = (struct sock *)value;
+ int ret;
+
+ if (unlikely(!sk || !sk_fullsock(sk)))
+ return -EINVAL;
+
+ if (!sock_map_sk_is_suitable(sk))
+ return -EOPNOTSUPP;
+
+ local_bh_disable();
+ bh_lock_sock(sk);
+ if (!sock_map_sk_state_allowed(sk))
+ ret = -EOPNOTSUPP;
+ else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+ else
+ ret = sock_hash_update_common(map, key, sk, flags);
+ bh_unlock_sock(sk);
+ local_bh_enable();
return ret;
}
@@ -459,14 +636,16 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct bpf_map *, map, u32, key, u64, flags)
{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+ struct sock *sk;
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
- if (!tcb->bpf.sk_redir)
+
+ sk = __sock_map_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+
+ skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
return SK_PASS;
}
@@ -483,12 +662,17 @@ const struct bpf_func_proto bpf_sk_redirect_map_proto = {
BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
struct bpf_map *, map, u32, key, u64, flags)
{
+ struct sock *sk;
+
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- msg->flags = flags;
- msg->sk_redir = __sock_map_lookup_elem(map, key);
- if (!msg->sk_redir)
+
+ sk = __sock_map_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+
+ msg->flags = flags;
+ msg->sk_redir = sk;
return SK_PASS;
}
@@ -502,33 +686,154 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = {
.arg4_type = ARG_ANYTHING,
};
+struct sock_map_seq_info {
+ struct bpf_map *map;
+ struct sock *sk;
+ u32 index;
+};
+
+struct bpf_iter__sockmap {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_map *, map);
+ __bpf_md_ptr(void *, key);
+ __bpf_md_ptr(struct sock *, sk);
+};
+
+DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta,
+ struct bpf_map *map, void *key,
+ struct sock *sk)
+
+static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
+{
+ if (unlikely(info->index >= info->map->max_entries))
+ return NULL;
+
+ info->sk = __sock_map_lookup_elem(info->map, info->index);
+
+ /* can't return sk directly, since that might be NULL */
+ return info;
+}
+
+static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+
+ if (*pos == 0)
+ ++*pos;
+
+ /* pairs with sock_map_seq_stop */
+ rcu_read_lock();
+ return sock_map_seq_lookup_elem(info);
+}
+
+static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+ __must_hold(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+
+ ++*pos;
+ ++info->index;
+
+ return sock_map_seq_lookup_elem(info);
+}
+
+static int sock_map_seq_show(struct seq_file *seq, void *v)
+ __must_hold(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+ struct bpf_iter__sockmap ctx = {};
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, !v);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (v) {
+ ctx.key = &info->index;
+ ctx.sk = info->sk;
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static void sock_map_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ if (!v)
+ (void)sock_map_seq_show(seq, NULL);
+
+ /* pairs with sock_map_seq_start */
+ rcu_read_unlock();
+}
+
+static const struct seq_operations sock_map_seq_ops = {
+ .start = sock_map_seq_start,
+ .next = sock_map_seq_next,
+ .stop = sock_map_seq_stop,
+ .show = sock_map_seq_show,
+};
+
+static int sock_map_init_seq_private(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct sock_map_seq_info *info = priv_data;
+
+ bpf_map_inc_with_uref(aux->map);
+ info->map = aux->map;
+ return 0;
+}
+
+static void sock_map_fini_seq_private(void *priv_data)
+{
+ struct sock_map_seq_info *info = priv_data;
+
+ bpf_map_put_with_uref(info->map);
+}
+
+static const struct bpf_iter_seq_info sock_map_iter_seq_info = {
+ .seq_ops = &sock_map_seq_ops,
+ .init_seq_private = sock_map_init_seq_private,
+ .fini_seq_private = sock_map_fini_seq_private,
+ .seq_priv_size = sizeof(struct sock_map_seq_info),
+};
+
+BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab)
const struct bpf_map_ops sock_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = sock_map_alloc,
.map_free = sock_map_free,
.map_get_next_key = sock_map_get_next_key,
+ .map_lookup_elem_sys_only = sock_map_lookup_sys,
.map_update_elem = sock_map_update_elem,
.map_delete_elem = sock_map_delete_elem,
.map_lookup_elem = sock_map_lookup,
.map_release_uref = sock_map_release_progs,
.map_check_btf = map_check_no_btf,
+ .map_btf_id = &sock_map_btf_ids[0],
+ .iter_seq_info = &sock_map_iter_seq_info,
};
-struct bpf_htab_elem {
+struct bpf_shtab_elem {
struct rcu_head rcu;
u32 hash;
struct sock *sk;
struct hlist_node node;
- u8 key[0];
+ u8 key[];
};
-struct bpf_htab_bucket {
+struct bpf_shtab_bucket {
struct hlist_head head;
raw_spinlock_t lock;
};
-struct bpf_htab {
+struct bpf_shtab {
struct bpf_map map;
- struct bpf_htab_bucket *buckets;
+ struct bpf_shtab_bucket *buckets;
u32 buckets_num;
u32 elem_size;
struct sk_psock_progs progs;
@@ -540,17 +845,17 @@ static inline u32 sock_hash_bucket_hash(const void *key, u32 len)
return jhash(key, len, 0);
}
-static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab,
- u32 hash)
+static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab,
+ u32 hash)
{
return &htab->buckets[hash & (htab->buckets_num - 1)];
}
-static struct bpf_htab_elem *
+static struct bpf_shtab_elem *
sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key,
u32 key_size)
{
- struct bpf_htab_elem *elem;
+ struct bpf_shtab_elem *elem;
hlist_for_each_entry_rcu(elem, head, node) {
if (elem->hash == hash &&
@@ -563,10 +868,10 @@ sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key,
static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
u32 key_size = map->key_size, hash;
- struct bpf_htab_bucket *bucket;
- struct bpf_htab_elem *elem;
+ struct bpf_shtab_bucket *bucket;
+ struct bpf_shtab_elem *elem;
WARN_ON_ONCE(!rcu_read_lock_held());
@@ -577,8 +882,8 @@ static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
return elem ? elem->sk : NULL;
}
-static void sock_hash_free_elem(struct bpf_htab *htab,
- struct bpf_htab_elem *elem)
+static void sock_hash_free_elem(struct bpf_shtab *htab,
+ struct bpf_shtab_elem *elem)
{
atomic_dec(&htab->count);
kfree_rcu(elem, rcu);
@@ -587,9 +892,9 @@ static void sock_hash_free_elem(struct bpf_htab *htab,
static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
void *link_raw)
{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct bpf_htab_elem *elem_probe, *elem = link_raw;
- struct bpf_htab_bucket *bucket;
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ struct bpf_shtab_elem *elem_probe, *elem = link_raw;
+ struct bpf_shtab_bucket *bucket;
WARN_ON_ONCE(!rcu_read_lock_held());
bucket = sock_hash_select_bucket(htab, elem->hash);
@@ -611,10 +916,10 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
static int sock_hash_delete_elem(struct bpf_map *map, void *key)
{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
u32 hash, key_size = map->key_size;
- struct bpf_htab_bucket *bucket;
- struct bpf_htab_elem *elem;
+ struct bpf_shtab_bucket *bucket;
+ struct bpf_shtab_elem *elem;
int ret = -ENOENT;
hash = sock_hash_bucket_hash(key, key_size);
@@ -632,12 +937,12 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key)
return ret;
}
-static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab,
- void *key, u32 key_size,
- u32 hash, struct sock *sk,
- struct bpf_htab_elem *old)
+static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab,
+ void *key, u32 key_size,
+ u32 hash, struct sock *sk,
+ struct bpf_shtab_elem *old)
{
- struct bpf_htab_elem *new;
+ struct bpf_shtab_elem *new;
if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
if (!old) {
@@ -646,8 +951,9 @@ static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab,
}
}
- new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
- htab->map.numa_node);
+ new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
+ GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
if (!new) {
atomic_dec(&htab->count);
return ERR_PTR(-ENOMEM);
@@ -661,11 +967,10 @@ static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab,
static int sock_hash_update_common(struct bpf_map *map, void *key,
struct sock *sk, u64 flags)
{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct inet_connection_sock *icsk = inet_csk(sk);
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
u32 key_size = map->key_size, hash;
- struct bpf_htab_elem *elem, *elem_new;
- struct bpf_htab_bucket *bucket;
+ struct bpf_shtab_elem *elem, *elem_new;
+ struct bpf_shtab_bucket *bucket;
struct sk_psock_link *link;
struct sk_psock *psock;
int ret;
@@ -673,14 +978,12 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
WARN_ON_ONCE(!rcu_read_lock_held());
if (unlikely(flags > BPF_EXIST))
return -EINVAL;
- if (unlikely(icsk->icsk_ulp_data))
- return -EINVAL;
link = sk_psock_init_link();
if (!link)
return -ENOMEM;
- ret = sock_map_link(map, &htab->progs, sk);
+ ret = sock_map_link(map, sk);
if (ret < 0)
goto out_free;
@@ -726,43 +1029,11 @@ out_free:
return ret;
}
-static int sock_hash_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
-{
- u32 ufd = *(u32 *)value;
- struct socket *sock;
- struct sock *sk;
- int ret;
-
- sock = sockfd_lookup(ufd, &ret);
- if (!sock)
- return ret;
- sk = sock->sk;
- if (!sk) {
- ret = -EINVAL;
- goto out;
- }
- if (!sock_map_sk_is_suitable(sk)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
- sock_map_sk_acquire(sk);
- if (sk->sk_state != TCP_ESTABLISHED)
- ret = -EOPNOTSUPP;
- else
- ret = sock_hash_update_common(map, key, sk, flags);
- sock_map_sk_release(sk);
-out:
- fput(sock->file);
- return ret;
-}
-
static int sock_hash_get_next_key(struct bpf_map *map, void *key,
void *key_next)
{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct bpf_htab_elem *elem, *elem_next;
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ struct bpf_shtab_elem *elem, *elem_next;
u32 hash, key_size = map->key_size;
struct hlist_head *head;
int i = 0;
@@ -775,8 +1046,8 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key,
if (!elem)
goto find_first_elem;
- elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)),
- struct bpf_htab_elem, node);
+ elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)),
+ struct bpf_shtab_elem, node);
if (elem_next) {
memcpy(key_next, elem_next->key, key_size);
return 0;
@@ -787,8 +1058,8 @@ static int sock_hash_get_next_key(struct bpf_map *map, void *key,
find_first_elem:
for (; i < htab->buckets_num; i++) {
head = &sock_hash_select_bucket(htab, i)->head;
- elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
- struct bpf_htab_elem, node);
+ elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)),
+ struct bpf_shtab_elem, node);
if (elem_next) {
memcpy(key_next, elem_next->key, key_size);
return 0;
@@ -800,44 +1071,37 @@ find_first_elem:
static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
{
- struct bpf_htab *htab;
+ struct bpf_shtab *htab;
int i, err;
- u64 cost;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
if (attr->max_entries == 0 ||
attr->key_size == 0 ||
- attr->value_size != 4 ||
+ (attr->value_size != sizeof(u32) &&
+ attr->value_size != sizeof(u64)) ||
attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
if (attr->key_size > MAX_BPF_STACK)
return ERR_PTR(-E2BIG);
- htab = kzalloc(sizeof(*htab), GFP_USER);
+ htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
if (!htab)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&htab->map, attr);
htab->buckets_num = roundup_pow_of_two(htab->map.max_entries);
- htab->elem_size = sizeof(struct bpf_htab_elem) +
+ htab->elem_size = sizeof(struct bpf_shtab_elem) +
round_up(htab->map.key_size, 8);
if (htab->buckets_num == 0 ||
- htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) {
- err = -EINVAL;
- goto free_htab;
- }
-
- cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) +
- (u64) htab->elem_size * htab->map.max_entries;
- if (cost >= U32_MAX - PAGE_SIZE) {
+ htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) {
err = -EINVAL;
goto free_htab;
}
htab->buckets = bpf_map_area_alloc(htab->buckets_num *
- sizeof(struct bpf_htab_bucket),
+ sizeof(struct bpf_shtab_bucket),
htab->map.numa_node);
if (!htab->buckets) {
err = -ENOMEM;
@@ -851,43 +1115,92 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
return &htab->map;
free_htab:
- kfree(htab);
+ bpf_map_area_free(htab);
return ERR_PTR(err);
}
static void sock_hash_free(struct bpf_map *map)
{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct bpf_htab_bucket *bucket;
- struct bpf_htab_elem *elem;
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ struct bpf_shtab_bucket *bucket;
+ struct hlist_head unlink_list;
+ struct bpf_shtab_elem *elem;
struct hlist_node *node;
int i;
+ /* After the sync no updates or deletes will be in-flight so it
+ * is safe to walk map and remove entries without risking a race
+ * in EEXIST update case.
+ */
synchronize_rcu();
for (i = 0; i < htab->buckets_num; i++) {
bucket = sock_hash_select_bucket(htab, i);
+
+ /* We are racing with sock_hash_delete_from_link to
+ * enter the spin-lock critical section. Every socket on
+ * the list is still linked to sockhash. Since link
+ * exists, psock exists and holds a ref to socket. That
+ * lets us to grab a socket ref too.
+ */
raw_spin_lock_bh(&bucket->lock);
- hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
- hlist_del_rcu(&elem->node);
+ hlist_for_each_entry(elem, &bucket->head, node)
+ sock_hold(elem->sk);
+ hlist_move_list(&bucket->head, &unlink_list);
+ raw_spin_unlock_bh(&bucket->lock);
+
+ /* Process removed entries out of atomic context to
+ * block for socket lock before deleting the psock's
+ * link to sockhash.
+ */
+ hlist_for_each_entry_safe(elem, node, &unlink_list, node) {
+ hlist_del(&elem->node);
lock_sock(elem->sk);
rcu_read_lock();
sock_map_unref(elem->sk, elem);
rcu_read_unlock();
release_sock(elem->sk);
+ sock_put(elem->sk);
+ sock_hash_free_elem(htab, elem);
}
- raw_spin_unlock_bh(&bucket->lock);
}
/* wait for psock readers accessing its map link */
synchronize_rcu();
bpf_map_area_free(htab->buckets);
- kfree(htab);
+ bpf_map_area_free(htab);
+}
+
+static void *sock_hash_lookup_sys(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ if (map->value_size != sizeof(u64))
+ return ERR_PTR(-ENOSPC);
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ __sock_gen_cookie(sk);
+ return &sk->sk_cookie;
+}
+
+static void *sock_hash_lookup(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (!sk)
+ return NULL;
+ if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+ return sk;
}
static void sock_hash_release_progs(struct bpf_map *map)
{
- psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs);
+ psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs);
}
BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops,
@@ -915,14 +1228,16 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = {
BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
struct bpf_map *, map, void *, key, u64, flags)
{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+ struct sock *sk;
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
- if (!tcb->bpf.sk_redir)
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+
+ skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
return SK_PASS;
}
@@ -939,12 +1254,17 @@ const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
struct bpf_map *, map, void *, key, u64, flags)
{
+ struct sock *sk;
+
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- msg->flags = flags;
- msg->sk_redir = __sock_hash_lookup_elem(map, key);
- if (!msg->sk_redir)
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+
+ msg->flags = flags;
+ msg->sk_redir = sk;
return SK_PASS;
}
@@ -958,15 +1278,144 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
.arg4_type = ARG_ANYTHING,
};
+struct sock_hash_seq_info {
+ struct bpf_map *map;
+ struct bpf_shtab *htab;
+ u32 bucket_id;
+};
+
+static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info,
+ struct bpf_shtab_elem *prev_elem)
+{
+ const struct bpf_shtab *htab = info->htab;
+ struct bpf_shtab_bucket *bucket;
+ struct bpf_shtab_elem *elem;
+ struct hlist_node *node;
+
+ /* try to find next elem in the same bucket */
+ if (prev_elem) {
+ node = rcu_dereference(hlist_next_rcu(&prev_elem->node));
+ elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
+ if (elem)
+ return elem;
+
+ /* no more elements, continue in the next bucket */
+ info->bucket_id++;
+ }
+
+ for (; info->bucket_id < htab->buckets_num; info->bucket_id++) {
+ bucket = &htab->buckets[info->bucket_id];
+ node = rcu_dereference(hlist_first_rcu(&bucket->head));
+ elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
+ if (elem)
+ return elem;
+ }
+
+ return NULL;
+}
+
+static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+
+ if (*pos == 0)
+ ++*pos;
+
+ /* pairs with sock_hash_seq_stop */
+ rcu_read_lock();
+ return sock_hash_seq_find_next(info, NULL);
+}
+
+static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+ __must_hold(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+
+ ++*pos;
+ return sock_hash_seq_find_next(info, v);
+}
+
+static int sock_hash_seq_show(struct seq_file *seq, void *v)
+ __must_hold(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+ struct bpf_iter__sockmap ctx = {};
+ struct bpf_shtab_elem *elem = v;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, !elem);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (elem) {
+ ctx.key = elem->key;
+ ctx.sk = elem->sk;
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static void sock_hash_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ if (!v)
+ (void)sock_hash_seq_show(seq, NULL);
+
+ /* pairs with sock_hash_seq_start */
+ rcu_read_unlock();
+}
+
+static const struct seq_operations sock_hash_seq_ops = {
+ .start = sock_hash_seq_start,
+ .next = sock_hash_seq_next,
+ .stop = sock_hash_seq_stop,
+ .show = sock_hash_seq_show,
+};
+
+static int sock_hash_init_seq_private(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct sock_hash_seq_info *info = priv_data;
+
+ bpf_map_inc_with_uref(aux->map);
+ info->map = aux->map;
+ info->htab = container_of(aux->map, struct bpf_shtab, map);
+ return 0;
+}
+
+static void sock_hash_fini_seq_private(void *priv_data)
+{
+ struct sock_hash_seq_info *info = priv_data;
+
+ bpf_map_put_with_uref(info->map);
+}
+
+static const struct bpf_iter_seq_info sock_hash_iter_seq_info = {
+ .seq_ops = &sock_hash_seq_ops,
+ .init_seq_private = sock_hash_init_seq_private,
+ .fini_seq_private = sock_hash_fini_seq_private,
+ .seq_priv_size = sizeof(struct sock_hash_seq_info),
+};
+
+BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab)
const struct bpf_map_ops sock_hash_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = sock_hash_alloc,
.map_free = sock_hash_free,
.map_get_next_key = sock_hash_get_next_key,
- .map_update_elem = sock_hash_update_elem,
+ .map_update_elem = sock_map_update_elem,
.map_delete_elem = sock_hash_delete_elem,
- .map_lookup_elem = sock_map_lookup,
+ .map_lookup_elem = sock_hash_lookup,
+ .map_lookup_elem_sys_only = sock_hash_lookup_sys,
.map_release_uref = sock_hash_release_progs,
.map_check_btf = map_check_no_btf,
+ .map_btf_id = &sock_hash_map_btf_ids[0],
+ .iter_seq_info = &sock_hash_iter_seq_info,
};
static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
@@ -975,7 +1424,7 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
case BPF_MAP_TYPE_SOCKMAP:
return &container_of(map, struct bpf_stab, map)->progs;
case BPF_MAP_TYPE_SOCKHASH:
- return &container_of(map, struct bpf_htab, map)->progs;
+ return &container_of(map, struct bpf_shtab, map)->progs;
default:
break;
}
@@ -983,8 +1432,8 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
return NULL;
}
-int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
- u32 which)
+static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
+ u32 which)
{
struct sk_psock_progs *progs = sock_map_progs(map);
@@ -993,13 +1442,22 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
switch (which) {
case BPF_SK_MSG_VERDICT:
- psock_set_prog(&progs->msg_parser, prog);
+ *pprog = &progs->msg_parser;
break;
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
case BPF_SK_SKB_STREAM_PARSER:
- psock_set_prog(&progs->skb_parser, prog);
+ *pprog = &progs->stream_parser;
break;
+#endif
case BPF_SK_SKB_STREAM_VERDICT:
- psock_set_prog(&progs->skb_verdict, prog);
+ if (progs->skb_verdict)
+ return -EBUSY;
+ *pprog = &progs->stream_verdict;
+ break;
+ case BPF_SK_SKB_VERDICT:
+ if (progs->stream_verdict)
+ return -EBUSY;
+ *pprog = &progs->skb_verdict;
break;
default:
return -EOPNOTSUPP;
@@ -1008,7 +1466,75 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
return 0;
}
-void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link)
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, u32 which)
+{
+ struct bpf_prog **pprog;
+ int ret;
+
+ ret = sock_map_prog_lookup(map, &pprog, which);
+ if (ret)
+ return ret;
+
+ if (old)
+ return psock_replace_prog(pprog, prog, old);
+
+ psock_set_prog(pprog, prog);
+ return 0;
+}
+
+int sock_map_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd;
+ struct bpf_prog **pprog;
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ struct fd f;
+ u32 id = 0;
+ int ret;
+
+ if (attr->query.query_flags)
+ return -EINVAL;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ rcu_read_lock();
+
+ ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type);
+ if (ret)
+ goto end;
+
+ prog = *pprog;
+ prog_cnt = !prog ? 0 : 1;
+
+ if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
+ goto end;
+
+ /* we do not hold the refcnt, the bpf prog may be released
+ * asynchronously and the id would be set to 0.
+ */
+ id = data_race(prog->aux->id);
+ if (id == 0)
+ prog_cnt = 0;
+
+end:
+ rcu_read_unlock();
+
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) ||
+ (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) ||
+ copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
+ ret = -EFAULT;
+
+ fdput(f);
+ return ret;
+}
+
+static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link)
{
switch (link->map->map_type) {
case BPF_MAP_TYPE_SOCKMAP:
@@ -1021,3 +1547,141 @@ void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link)
break;
}
}
+
+static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_link *link;
+
+ while ((link = sk_psock_link_pop(psock))) {
+ sock_map_unlink(sk, link);
+ sk_psock_free_link(link);
+ }
+}
+
+void sock_map_unhash(struct sock *sk)
+{
+ void (*saved_unhash)(struct sock *sk);
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ if (sk->sk_prot->unhash)
+ sk->sk_prot->unhash(sk);
+ return;
+ }
+
+ saved_unhash = psock->saved_unhash;
+ sock_map_remove_links(sk, psock);
+ rcu_read_unlock();
+ saved_unhash(sk);
+}
+EXPORT_SYMBOL_GPL(sock_map_unhash);
+
+void sock_map_destroy(struct sock *sk)
+{
+ void (*saved_destroy)(struct sock *sk);
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ if (sk->sk_prot->destroy)
+ sk->sk_prot->destroy(sk);
+ return;
+ }
+
+ saved_destroy = psock->saved_destroy;
+ sock_map_remove_links(sk, psock);
+ rcu_read_unlock();
+ sk_psock_stop(psock);
+ sk_psock_put(sk, psock);
+ saved_destroy(sk);
+}
+EXPORT_SYMBOL_GPL(sock_map_destroy);
+
+void sock_map_close(struct sock *sk, long timeout)
+{
+ void (*saved_close)(struct sock *sk, long timeout);
+ struct sk_psock *psock;
+
+ lock_sock(sk);
+ rcu_read_lock();
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ release_sock(sk);
+ return sk->sk_prot->close(sk, timeout);
+ }
+
+ saved_close = psock->saved_close;
+ sock_map_remove_links(sk, psock);
+ rcu_read_unlock();
+ sk_psock_stop(psock);
+ release_sock(sk);
+ cancel_work_sync(&psock->work);
+ sk_psock_put(sk, psock);
+ saved_close(sk, timeout);
+}
+EXPORT_SYMBOL_GPL(sock_map_close);
+
+static int sock_map_iter_attach_target(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_map *map;
+ int err = -EINVAL;
+
+ if (!linfo->map.map_fd)
+ return -EBADF;
+
+ map = bpf_map_get_with_uref(linfo->map.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP &&
+ map->map_type != BPF_MAP_TYPE_SOCKHASH)
+ goto put_map;
+
+ if (prog->aux->max_rdonly_access > map->key_size) {
+ err = -EACCES;
+ goto put_map;
+ }
+
+ aux->map = map;
+ return 0;
+
+put_map:
+ bpf_map_put_with_uref(map);
+ return err;
+}
+
+static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux)
+{
+ bpf_map_put_with_uref(aux->map);
+}
+
+static struct bpf_iter_reg sock_map_iter_reg = {
+ .target = "sockmap",
+ .attach_target = sock_map_iter_attach_target,
+ .detach_target = sock_map_iter_detach_target,
+ .show_fdinfo = bpf_iter_map_show_fdinfo,
+ .fill_link_info = bpf_iter_map_fill_link_info,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__sockmap, key),
+ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
+ { offsetof(struct bpf_iter__sockmap, sk),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+};
+
+static int __init bpf_sockmap_iter_init(void)
+{
+ sock_map_iter_reg.ctx_arg_info[1].btf_id =
+ btf_sock_ids[BTF_SOCK_TYPE_SOCK];
+ return bpf_iter_reg_target(&sock_map_iter_reg);
+}
+late_initcall(bpf_sockmap_iter_init);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 91e9f2223c39..fb90e1e00773 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -6,6 +6,7 @@
* selecting the socket index from the array of available sockets.
*/
+#include <net/ip.h>
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
@@ -16,25 +17,90 @@
DEFINE_SPINLOCK(reuseport_lock);
-#define REUSEPORT_MIN_ID 1
static DEFINE_IDA(reuseport_ida);
+static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
+ struct sock_reuseport *reuse, bool bind_inany);
-int reuseport_get_id(struct sock_reuseport *reuse)
+void reuseport_has_conns_set(struct sock *sk)
{
- int id;
+ struct sock_reuseport *reuse;
- if (reuse->reuseport_id)
- return reuse->reuseport_id;
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return;
- id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0,
- /* Called under reuseport_lock */
- GFP_ATOMIC);
- if (id < 0)
- return id;
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ if (likely(reuse))
+ reuse->has_conns = 1;
+ spin_unlock_bh(&reuseport_lock);
+}
+EXPORT_SYMBOL(reuseport_has_conns_set);
- reuse->reuseport_id = id;
+static int reuseport_sock_index(struct sock *sk,
+ const struct sock_reuseport *reuse,
+ bool closed)
+{
+ int left, right;
+
+ if (!closed) {
+ left = 0;
+ right = reuse->num_socks;
+ } else {
+ left = reuse->max_socks - reuse->num_closed_socks;
+ right = reuse->max_socks;
+ }
- return reuse->reuseport_id;
+ for (; left < right; left++)
+ if (reuse->socks[left] == sk)
+ return left;
+ return -1;
+}
+
+static void __reuseport_add_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ reuse->socks[reuse->num_socks] = sk;
+ /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
+ smp_wmb();
+ reuse->num_socks++;
+}
+
+static bool __reuseport_detach_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ int i = reuseport_sock_index(sk, reuse, false);
+
+ if (i == -1)
+ return false;
+
+ reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
+ reuse->num_socks--;
+
+ return true;
+}
+
+static void __reuseport_add_closed_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
+ /* paired with READ_ONCE() in inet_csk_bind_conflict() */
+ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
+}
+
+static bool __reuseport_detach_closed_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ int i = reuseport_sock_index(sk, reuse, true);
+
+ if (i == -1)
+ return false;
+
+ reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
+ /* paired with READ_ONCE() in inet_csk_bind_conflict() */
+ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);
+
+ return true;
}
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
@@ -55,6 +121,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
int reuseport_alloc(struct sock *sk, bool bind_inany)
{
struct sock_reuseport *reuse;
+ int id, ret = 0;
/* bh lock used since this function call may precede hlist lock in
* soft irq of receive path or setsockopt from process context
@@ -67,6 +134,12 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
if (reuse) {
+ if (reuse->num_closed_socks) {
+ /* sk was shutdown()ed before */
+ ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
+ goto out;
+ }
+
/* Only set reuse->bind_inany if the bind_inany is true.
* Otherwise, it will overwrite the reuse->bind_inany
* which was set by the bind/hash path.
@@ -78,19 +151,27 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
reuse = __reuseport_alloc(INIT_SOCKS);
if (!reuse) {
- spin_unlock_bh(&reuseport_lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
+ id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
+ if (id < 0) {
+ kfree(reuse);
+ ret = id;
+ goto out;
+ }
+
+ reuse->reuseport_id = id;
+ reuse->bind_inany = bind_inany;
reuse->socks[0] = sk;
reuse->num_socks = 1;
- reuse->bind_inany = bind_inany;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
out:
spin_unlock_bh(&reuseport_lock);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(reuseport_alloc);
@@ -100,23 +181,44 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
u32 more_socks_size, i;
more_socks_size = reuse->max_socks * 2U;
- if (more_socks_size > U16_MAX)
+ if (more_socks_size > U16_MAX) {
+ if (reuse->num_closed_socks) {
+ /* Make room by removing a closed sk.
+ * The child has already been migrated.
+ * Only reqsk left at this point.
+ */
+ struct sock *sk;
+
+ sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
+ RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
+ __reuseport_detach_closed_sock(sk, reuse);
+
+ return reuse;
+ }
+
return NULL;
+ }
more_reuse = __reuseport_alloc(more_socks_size);
if (!more_reuse)
return NULL;
more_reuse->num_socks = reuse->num_socks;
+ more_reuse->num_closed_socks = reuse->num_closed_socks;
more_reuse->prog = reuse->prog;
more_reuse->reuseport_id = reuse->reuseport_id;
more_reuse->bind_inany = reuse->bind_inany;
+ more_reuse->has_conns = reuse->has_conns;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
+ memcpy(more_reuse->socks +
+ (more_reuse->max_socks - more_reuse->num_closed_socks),
+ reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
+ reuse->num_closed_socks * sizeof(struct sock *));
more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
- for (i = 0; i < reuse->num_socks; ++i)
+ for (i = 0; i < reuse->max_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
more_reuse);
@@ -134,8 +236,7 @@ static void reuseport_free_rcu(struct rcu_head *head)
reuse = container_of(head, struct sock_reuseport, rcu);
sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
- if (reuse->reuseport_id)
- ida_simple_remove(&reuseport_ida, reuse->reuseport_id);
+ ida_free(&reuseport_ida, reuse->reuseport_id);
kfree(reuse);
}
@@ -162,13 +263,21 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
- lockdep_is_held(&reuseport_lock));
+ lockdep_is_held(&reuseport_lock));
+ if (old_reuse && old_reuse->num_closed_socks) {
+ /* sk was shutdown()ed before */
+ int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);
+
+ spin_unlock_bh(&reuseport_lock);
+ return err;
+ }
+
if (old_reuse && old_reuse->num_socks != 1) {
spin_unlock_bh(&reuseport_lock);
return -EBUSY;
}
- if (reuse->num_socks == reuse->max_socks) {
+ if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
reuse = reuseport_grow(reuse);
if (!reuse) {
spin_unlock_bh(&reuseport_lock);
@@ -176,10 +285,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
}
}
- reuse->socks[reuse->num_socks] = sk;
- /* paired with smp_rmb() in reuseport_select_sock() */
- smp_wmb();
- reuse->num_socks++;
+ __reuseport_add_sock(sk, reuse);
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
spin_unlock_bh(&reuseport_lock);
@@ -190,37 +296,135 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
}
EXPORT_SYMBOL(reuseport_add_sock);
+static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
+ struct sock_reuseport *reuse, bool bind_inany)
+{
+ if (old_reuse == reuse) {
+ /* If sk was in the same reuseport group, just pop sk out of
+ * the closed section and push sk into the listening section.
+ */
+ __reuseport_detach_closed_sock(sk, old_reuse);
+ __reuseport_add_sock(sk, old_reuse);
+ return 0;
+ }
+
+ if (!reuse) {
+ /* In bind()/listen() path, we cannot carry over the eBPF prog
+ * for the shutdown()ed socket. In setsockopt() path, we should
+ * not change the eBPF prog of listening sockets by attaching a
+ * prog to the shutdown()ed socket. Thus, we will allocate a new
+ * reuseport group and detach sk from the old group.
+ */
+ int id;
+
+ reuse = __reuseport_alloc(INIT_SOCKS);
+ if (!reuse)
+ return -ENOMEM;
+
+ id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
+ if (id < 0) {
+ kfree(reuse);
+ return id;
+ }
+
+ reuse->reuseport_id = id;
+ reuse->bind_inany = bind_inany;
+ } else {
+ /* Move sk from the old group to the new one if
+ * - all the other listeners in the old group were close()d or
+ * shutdown()ed, and then sk2 has listen()ed on the same port
+ * OR
+ * - sk listen()ed without bind() (or with autobind), was
+ * shutdown()ed, and then listen()s on another port which
+ * sk2 listen()s on.
+ */
+ if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
+ reuse = reuseport_grow(reuse);
+ if (!reuse)
+ return -ENOMEM;
+ }
+ }
+
+ __reuseport_detach_closed_sock(sk, old_reuse);
+ __reuseport_add_sock(sk, reuse);
+ rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+ if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
+ call_rcu(&old_reuse->rcu, reuseport_free_rcu);
+
+ return 0;
+}
+
void reuseport_detach_sock(struct sock *sk)
{
struct sock_reuseport *reuse;
- int i;
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
- /* At least one of the sk in this reuseport group is added to
- * a bpf map. Notify the bpf side. The bpf map logic will
- * remove the sk if it is indeed added to a bpf map.
+ /* reuseport_grow() has detached a closed sk */
+ if (!reuse)
+ goto out;
+
+ /* Notify the bpf side. The sk may be added to a sockarray
+ * map. If so, sockarray logic will remove it from the map.
+ *
+ * Other bpf map types that work with reuseport, like sockmap,
+ * don't need an explicit callback from here. They override sk
+ * unhash/close ops to remove the sk from the map before we
+ * get to this point.
*/
- if (reuse->reuseport_id)
- bpf_sk_reuseport_detach(sk);
+ bpf_sk_reuseport_detach(sk);
rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
- for (i = 0; i < reuse->num_socks; i++) {
- if (reuse->socks[i] == sk) {
- reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
- reuse->num_socks--;
- if (reuse->num_socks == 0)
- call_rcu(&reuse->rcu, reuseport_free_rcu);
- break;
- }
- }
+ if (!__reuseport_detach_closed_sock(sk, reuse))
+ __reuseport_detach_sock(sk, reuse);
+
+ if (reuse->num_socks + reuse->num_closed_socks == 0)
+ call_rcu(&reuse->rcu, reuseport_free_rcu);
+
+out:
spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);
+void reuseport_stop_listen_sock(struct sock *sk)
+{
+ if (sk->sk_protocol == IPPROTO_TCP) {
+ struct sock_reuseport *reuse;
+ struct bpf_prog *prog;
+
+ spin_lock_bh(&reuseport_lock);
+
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ prog = rcu_dereference_protected(reuse->prog,
+ lockdep_is_held(&reuseport_lock));
+
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) ||
+ (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
+ /* Migration capable, move sk from the listening section
+ * to the closed section.
+ */
+ bpf_sk_reuseport_detach(sk);
+
+ __reuseport_detach_sock(sk, reuse);
+ __reuseport_add_closed_sock(sk, reuse);
+
+ spin_unlock_bh(&reuseport_lock);
+ return;
+ }
+
+ spin_unlock_bh(&reuseport_lock);
+ }
+
+ /* Not capable to do migration, detach immediately */
+ reuseport_detach_sock(sk);
+}
+EXPORT_SYMBOL(reuseport_stop_listen_sock);
+
static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
struct bpf_prog *prog, struct sk_buff *skb,
int hdr_len)
@@ -251,6 +455,23 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
return reuse->socks[index];
}
+static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
+ u32 hash, u16 num_socks)
+{
+ int i, j;
+
+ i = j = reciprocal_scale(hash, num_socks);
+ while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
+ i++;
+ if (i >= num_socks)
+ i = 0;
+ if (i == j)
+ return NULL;
+ }
+
+ return reuse->socks[i];
+}
+
/**
* reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
* @sk: First socket in the group.
@@ -281,32 +502,21 @@ struct sock *reuseport_select_sock(struct sock *sk,
prog = rcu_dereference(reuse->prog);
socks = READ_ONCE(reuse->num_socks);
if (likely(socks)) {
- /* paired with smp_wmb() in reuseport_add_sock() */
+ /* paired with smp_wmb() in __reuseport_add_sock() */
smp_rmb();
if (!prog || !skb)
goto select_by_hash;
if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
- sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
+ sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
else
sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
select_by_hash:
/* no bpf or invalid bpf result: fall back to hash usage */
- if (!sk2) {
- int i, j;
-
- i = j = reciprocal_scale(hash, socks);
- while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) {
- i++;
- if (i >= reuse->num_socks)
- i = 0;
- if (i == j)
- goto out;
- }
- sk2 = reuse->socks[i];
- }
+ if (!sk2)
+ sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
}
out:
@@ -315,14 +525,90 @@ out:
}
EXPORT_SYMBOL(reuseport_select_sock);
+/**
+ * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
+ * @sk: close()ed or shutdown()ed socket in the group.
+ * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
+ * NEW_SYN_RECV request socket during 3WHS.
+ * @skb: skb to run through BPF filter.
+ * Returns a socket (with sk_refcnt +1) that should accept the child socket
+ * (or NULL on error).
+ */
+struct sock *reuseport_migrate_sock(struct sock *sk,
+ struct sock *migrating_sk,
+ struct sk_buff *skb)
+{
+ struct sock_reuseport *reuse;
+ struct sock *nsk = NULL;
+ bool allocated = false;
+ struct bpf_prog *prog;
+ u16 socks;
+ u32 hash;
+
+ rcu_read_lock();
+
+ reuse = rcu_dereference(sk->sk_reuseport_cb);
+ if (!reuse)
+ goto out;
+
+ socks = READ_ONCE(reuse->num_socks);
+ if (unlikely(!socks))
+ goto failure;
+
+ /* paired with smp_wmb() in __reuseport_add_sock() */
+ smp_rmb();
+
+ hash = migrating_sk->sk_hash;
+ prog = rcu_dereference(reuse->prog);
+ if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req))
+ goto select_by_hash;
+ goto failure;
+ }
+
+ if (!skb) {
+ skb = alloc_skb(0, GFP_ATOMIC);
+ if (!skb)
+ goto failure;
+ allocated = true;
+ }
+
+ nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);
+
+ if (allocated)
+ kfree_skb(skb);
+
+select_by_hash:
+ if (!nsk)
+ nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
+
+ if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) {
+ nsk = NULL;
+ goto failure;
+ }
+
+out:
+ rcu_read_unlock();
+ return nsk;
+
+failure:
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
+ goto out;
+}
+EXPORT_SYMBOL(reuseport_migrate_sock);
+
int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
- if (sk_unhashed(sk) && sk->sk_reuseport) {
- int err = reuseport_alloc(sk, false);
+ if (sk_unhashed(sk)) {
+ int err;
+
+ if (!sk->sk_reuseport)
+ return -EINVAL;
+ err = reuseport_alloc(sk, false);
if (err)
return err;
} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
@@ -348,13 +634,24 @@ int reuseport_detach_prog(struct sock *sk)
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
- if (!rcu_access_pointer(sk->sk_reuseport_cb))
- return sk->sk_reuseport ? -ENOENT : -EINVAL;
-
old_prog = NULL;
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
+
+ /* reuse must be checked after acquiring the reuseport_lock
+ * because reuseport_grow() can detach a closed sk.
+ */
+ if (!reuse) {
+ spin_unlock_bh(&reuseport_lock);
+ return sk->sk_reuseport ? -ENOENT : -EINVAL;
+ }
+
+ if (sk_unhashed(sk) && reuse->num_closed_socks) {
+ spin_unlock_bh(&reuseport_lock);
+ return -ENOENT;
+ }
+
old_prog = rcu_replace_pointer(reuse->prog, old_prog,
lockdep_is_held(&reuseport_lock));
spin_unlock_bh(&reuseport_lock);
diff --git a/net/core/stream.c b/net/core/stream.c
index 4f1d4aa5fb38..75fded8495f5 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -123,7 +123,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (sk_stream_memory_free(sk))
- current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
+ current_timeo = vm_wait = prandom_u32_max(HZ / 5) + 2;
add_wait_queue(sk_sleep(sk), &wait);
@@ -159,7 +159,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
*timeo_p = current_timeo;
}
out:
- remove_wait_queue(sk_sleep(sk), &wait);
+ if (!sock_flag(sk, SOCK_DEAD))
+ remove_wait_queue(sk_sleep(sk), &wait);
return err;
do_error:
@@ -195,17 +196,14 @@ void sk_stream_kill_queues(struct sock *sk)
/* First the read buffer. */
__skb_queue_purge(&sk->sk_receive_queue);
- /* Next, the error queue. */
- __skb_queue_purge(&sk->sk_error_queue);
-
/* Next, the write queue. */
- WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
+ WARN_ON_ONCE(!skb_queue_empty(&sk->sk_write_queue));
/* Account for returned memory. */
- sk_mem_reclaim(sk);
+ sk_mem_reclaim_final(sk);
- WARN_ON(sk->sk_wmem_queued);
- WARN_ON(sk->sk_forward_alloc);
+ WARN_ON_ONCE(sk->sk_wmem_queued);
+ WARN_ON_ONCE(sk->sk_forward_alloc);
/* It is _impossible_ for the backlog to contain anything
* when we get here. All user references to this socket
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 9f9e00ba3ad7..5b1ce656baa1 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -6,6 +6,7 @@
* Added /proc/sys/net/core directory entry (empty =) ). [MS]
*/
+#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/module.h>
@@ -22,12 +23,12 @@
#include <net/busy_poll.h>
#include <net/pkt_sched.h>
-static int two __maybe_unused = 2;
+#include "dev.h"
+
+static int int_3600 = 3600;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;
-static long long_one __maybe_unused = 1;
-static long long_max __maybe_unused = LONG_MAX;
static int net_msg_warn; /* Unused, but still a sysctl */
@@ -39,13 +40,14 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
* IPv6: reset all settings to default
* 1 - Both inherit all current settings from init_net
* 2 - Both reset all settings to default
+ * 3 - Both inherit all settings from current netns
*/
int sysctl_devconf_inherit_init_net __read_mostly;
EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
#ifdef CONFIG_RPS
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int orig_size, size;
int ret, i;
@@ -99,8 +101,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
if (orig_sock_table) {
static_branch_dec(&rps_needed);
static_branch_dec(&rfs_needed);
- synchronize_rcu();
- vfree(orig_sock_table);
+ kvfree_rcu(orig_sock_table);
}
}
}
@@ -115,8 +116,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
static DEFINE_MUTEX(flow_limit_update_mutex);
static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct sd_flow_limit *cur;
struct softnet_data *sd;
@@ -127,7 +127,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
return -ENOMEM;
if (write) {
- ret = cpumask_parse_user(buffer, *lenp, mask);
+ ret = cpumask_parse(buffer, mask);
if (ret)
goto done;
@@ -139,8 +139,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
lockdep_is_held(&flow_limit_update_mutex));
if (cur && !cpumask_test_cpu(i, mask)) {
RCU_INIT_POINTER(sd->flow_limit, NULL);
- synchronize_rcu();
- kfree(cur);
+ kfree_rcu(cur);
} else if (!cur && cpumask_test_cpu(i, mask)) {
cur = kzalloc_node(len, GFP_KERNEL,
cpu_to_node(i));
@@ -180,10 +179,7 @@ write_unlock:
}
if (len < *lenp)
kbuf[len++] = '\n';
- if (copy_to_user(buffer, kbuf, len)) {
- ret = -EFAULT;
- goto done;
- }
+ memcpy(buffer, kbuf, len);
*lenp = len;
*ppos += len;
}
@@ -194,8 +190,7 @@ done:
}
static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int old, *ptr;
int ret;
@@ -217,7 +212,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_NET_SCHED
static int set_default_qdisc(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char id[IFNAMSIZ];
struct ctl_table tbl = {
@@ -236,22 +231,25 @@ static int set_default_qdisc(struct ctl_table *table, int write,
#endif
static int proc_do_dev_weight(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- int ret;
+ static DEFINE_MUTEX(dev_weight_mutex);
+ int ret, weight;
+ mutex_lock(&dev_weight_mutex);
ret = proc_dointvec(table, write, buffer, lenp, ppos);
- if (ret != 0)
- return ret;
-
- dev_rx_weight = weight_p * dev_weight_rx_bias;
- dev_tx_weight = weight_p * dev_weight_tx_bias;
+ if (!ret && write) {
+ weight = READ_ONCE(weight_p);
+ WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias);
+ WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias);
+ }
+ mutex_unlock(&dev_weight_mutex);
return ret;
}
static int proc_do_rss_key(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table fake_table;
char buf[NETDEV_RSS_KEY_LEN * 3];
@@ -264,10 +262,12 @@ static int proc_do_rss_key(struct ctl_table *table, int write,
#ifdef CONFIG_BPF_JIT
static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int ret, jit_enable = *(int *)table->data;
+ int min = *(int *)table->extra1;
+ int max = *(int *)table->extra2;
struct ctl_table tmp = *table;
if (write && !capable(CAP_SYS_ADMIN))
@@ -277,7 +277,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && !ret) {
if (jit_enable < 2 ||
- (jit_enable == 2 && bpf_dump_raw_ok())) {
+ (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) {
*(int *)table->data = jit_enable;
if (jit_enable == 2)
pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
@@ -285,14 +285,17 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
ret = -EPERM;
}
}
+
+ if (write && ret && min == max)
+ pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n");
+
return ret;
}
# ifdef CONFIG_HAVE_EBPF_JIT
static int
proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -303,8 +306,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
static int
proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -314,7 +316,6 @@ proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
#endif
static struct ctl_table net_core_table[] = {
-#ifdef CONFIG_NET
{
.procname = "wmem_max",
.data = &sysctl_wmem_max,
@@ -394,7 +395,7 @@ static struct ctl_table net_core_table[] = {
.extra2 = SYSCTL_ONE,
# else
.extra1 = SYSCTL_ZERO,
- .extra2 = &two,
+ .extra2 = SYSCTL_TWO,
# endif
},
# ifdef CONFIG_HAVE_EBPF_JIT
@@ -405,7 +406,7 @@ static struct ctl_table net_core_table[] = {
.mode = 0600,
.proc_handler = proc_dointvec_minmax_bpf_restricted,
.extra1 = SYSCTL_ZERO,
- .extra2 = &two,
+ .extra2 = SYSCTL_TWO,
},
{
.procname = "bpf_jit_kallsyms",
@@ -423,8 +424,8 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(long),
.mode = 0600,
.proc_handler = proc_dolongvec_minmax_bpf_restricted,
- .extra1 = &long_one,
- .extra2 = &long_max,
+ .extra1 = SYSCTL_LONG_ONE,
+ .extra2 = &bpf_jit_limit_max,
},
#endif
{
@@ -512,7 +513,6 @@ static struct ctl_table net_core_table[] = {
.proc_handler = set_default_qdisc
},
#endif
-#endif /* CONFIG_NET */
{
.procname = "netdev_budget",
.data = &netdev_budget,
@@ -551,7 +551,7 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_TWO,
},
{
.procname = "devconf_inherit_init_net",
@@ -560,7 +560,7 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = &two,
+ .extra2 = SYSCTL_THREE,
},
{
.procname = "high_order_alloc_disable",
@@ -577,6 +577,23 @@ static struct ctl_table net_core_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
},
+ {
+ .procname = "netdev_unregister_timeout_secs",
+ .data = &netdev_unregister_timeout_secs,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &int_3600,
+ },
+ {
+ .procname = "skb_defer_max",
+ .data = &sysctl_skb_defer_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
{ }
};
@@ -589,12 +606,34 @@ static struct ctl_table netns_core_table[] = {
.extra1 = SYSCTL_ZERO,
.proc_handler = proc_dointvec_minmax
},
+ {
+ .procname = "txrehash",
+ .data = &init_net.core.sysctl_txrehash,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ .proc_handler = proc_dou8vec_minmax,
+ },
{ }
};
+static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str)
+{
+ /* fallback tunnels for initns only */
+ if (!strncmp(str, "initns", 6))
+ sysctl_fb_tunnels_only_for_init_net = 1;
+ /* no fallback tunnels anywhere */
+ else if (!strncmp(str, "none", 4))
+ sysctl_fb_tunnels_only_for_init_net = 2;
+
+ return 1;
+}
+__setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
+
static __net_init int sysctl_core_net_init(struct net *net)
{
- struct ctl_table *tbl;
+ struct ctl_table *tbl, *tmp;
tbl = netns_core_table;
if (!net_eq(net, &init_net)) {
@@ -602,7 +641,8 @@ static __net_init int sysctl_core_net_init(struct net *net)
if (tbl == NULL)
goto err_dup;
- tbl[0].data = &net->core.sysctl_somaxconn;
+ for (tmp = tbl; tmp->procname; tmp++)
+ tmp->data += (char *)net - (char *)&init_net;
/* Don't export any sysctls to unprivileged users */
if (net->user_ns != &init_user_ns) {
diff --git a/net/core/tso.c b/net/core/tso.c
index d4d5c077ad72..4148f6d48953 100644
--- a/net/core/tso.c
+++ b/net/core/tso.c
@@ -6,18 +6,17 @@
#include <asm/unaligned.h>
/* Calculate expected number of TX descriptors */
-int tso_count_descs(struct sk_buff *skb)
+int tso_count_descs(const struct sk_buff *skb)
{
/* The Marvell Way */
return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
}
EXPORT_SYMBOL(tso_count_descs);
-void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
+void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso,
int size, bool is_last)
{
- struct tcphdr *tcph;
- int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int hdr_len = skb_transport_offset(skb) + tso->tlen;
int mac_hdr_len = skb_network_offset(skb);
memcpy(hdr, skb->data, hdr_len);
@@ -30,23 +29,31 @@ void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
} else {
struct ipv6hdr *iph = (void *)(hdr + mac_hdr_len);
- iph->payload_len = htons(size + tcp_hdrlen(skb));
+ iph->payload_len = htons(size + tso->tlen);
}
- tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb));
- put_unaligned_be32(tso->tcp_seq, &tcph->seq);
+ hdr += skb_transport_offset(skb);
+ if (tso->tlen != sizeof(struct udphdr)) {
+ struct tcphdr *tcph = (struct tcphdr *)hdr;
- if (!is_last) {
- /* Clear all special flags for not last packet */
- tcph->psh = 0;
- tcph->fin = 0;
- tcph->rst = 0;
+ put_unaligned_be32(tso->tcp_seq, &tcph->seq);
+
+ if (!is_last) {
+ /* Clear all special flags for not last packet */
+ tcph->psh = 0;
+ tcph->fin = 0;
+ tcph->rst = 0;
+ }
+ } else {
+ struct udphdr *uh = (struct udphdr *)hdr;
+
+ uh->len = htons(sizeof(*uh) + size);
}
}
EXPORT_SYMBOL(tso_build_hdr);
-void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
+void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size)
{
- tso->tcp_seq += size;
+ tso->tcp_seq += size; /* not worth avoiding this operation for UDP */
tso->size -= size;
tso->data += size;
@@ -62,12 +69,14 @@ void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
}
EXPORT_SYMBOL(tso_build_data);
-void tso_start(struct sk_buff *skb, struct tso_t *tso)
+int tso_start(struct sk_buff *skb, struct tso_t *tso)
{
- int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int tlen = skb_is_gso_tcp(skb) ? tcp_hdrlen(skb) : sizeof(struct udphdr);
+ int hdr_len = skb_transport_offset(skb) + tlen;
+ tso->tlen = tlen;
tso->ip_id = ntohs(ip_hdr(skb)->id);
- tso->tcp_seq = ntohl(tcp_hdr(skb)->seq);
+ tso->tcp_seq = (tlen != sizeof(struct udphdr)) ? ntohl(tcp_hdr(skb)->seq) : 0;
tso->next_frag_idx = 0;
tso->ipv6 = vlan_get_protocol(skb) == htons(ETH_P_IPV6);
@@ -83,5 +92,6 @@ void tso_start(struct sk_buff *skb, struct tso_t *tso)
tso->data = skb_frag_address(frag);
tso->next_frag_idx++;
}
+ return hdr_len;
}
EXPORT_SYMBOL(tso_start);
diff --git a/net/core/utils.c b/net/core/utils.c
index 1f31a39236d5..938495bc1d34 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -476,9 +476,9 @@ void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
__wsum diff, bool pseudohdr)
{
if (skb->ip_summed != CHECKSUM_PARTIAL) {
- *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum)));
+ csum_replace_by_diff(sum, diff);
if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
- skb->csum = ~csum_add(diff, ~skb->csum);
+ skb->csum = ~csum_sub(diff, skb->csum);
} else if (pseudohdr) {
*sum = ~csum_fold(csum_add(diff, csum_unfold(*sum)));
}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 8310714c47fd..844c9d99dc0e 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -11,11 +11,13 @@
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/rhashtable.h>
+#include <linux/bug.h>
#include <net/page_pool.h>
#include <net/xdp.h>
#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
#include <trace/events/xdp.h>
+#include <net/xdp_sock_drv.h>
#define REG_STATE_NEW 0x0
#define REG_STATE_REGISTERED 0x1
@@ -108,49 +110,36 @@ static void mem_allocator_disconnect(void *allocator)
mutex_unlock(&mem_id_lock);
}
-static void mem_id_disconnect(int id)
+void xdp_unreg_mem_model(struct xdp_mem_info *mem)
{
struct xdp_mem_allocator *xa;
+ int type = mem->type;
+ int id = mem->id;
- mutex_lock(&mem_id_lock);
+ /* Reset mem info to defaults */
+ mem->id = 0;
+ mem->type = 0;
- xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
- if (!xa) {
- mutex_unlock(&mem_id_lock);
- WARN(1, "Request remove non-existing id(%d), driver bug?", id);
+ if (id == 0)
return;
- }
- trace_mem_disconnect(xa);
-
- if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
- call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
-
- mutex_unlock(&mem_id_lock);
+ if (type == MEM_TYPE_PAGE_POOL) {
+ rcu_read_lock();
+ xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
+ page_pool_destroy(xa->page_pool);
+ rcu_read_unlock();
+ }
}
+EXPORT_SYMBOL_GPL(xdp_unreg_mem_model);
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
- struct xdp_mem_allocator *xa;
- int id = xdp_rxq->mem.id;
-
if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
WARN(1, "Missing register, driver bug");
return;
}
- if (id == 0)
- return;
-
- if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY)
- return mem_id_disconnect(id);
-
- if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
- rcu_read_lock();
- xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
- page_pool_destroy(xa->page_pool);
- rcu_read_unlock();
- }
+ xdp_unreg_mem_model(&xdp_rxq->mem);
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
@@ -160,16 +149,10 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
if (xdp_rxq->reg_state == REG_STATE_UNUSED)
return;
- WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
-
xdp_rxq_info_unreg_mem_model(xdp_rxq);
xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
xdp_rxq->dev = NULL;
-
- /* Reset mem info to defaults */
- xdp_rxq->mem.id = 0;
- xdp_rxq->mem.type = 0;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
@@ -179,9 +162,15 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
}
/* Returns 0 on success, negative on failure */
-int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
- struct net_device *dev, u32 queue_index)
+int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+ struct net_device *dev, u32 queue_index,
+ unsigned int napi_id, u32 frag_size)
{
+ if (!dev) {
+ WARN(1, "Missing net_device from driver");
+ return -ENODEV;
+ }
+
if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
WARN(1, "Driver promised not to register this");
return -EINVAL;
@@ -192,20 +181,17 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
xdp_rxq_info_unreg(xdp_rxq);
}
- if (!dev) {
- WARN(1, "Missing net_device from driver");
- return -ENODEV;
- }
-
/* State either UNREGISTERED or NEW */
xdp_rxq_info_init(xdp_rxq);
xdp_rxq->dev = dev;
xdp_rxq->queue_index = queue_index;
+ xdp_rxq->napi_id = napi_id;
+ xdp_rxq->frag_size = frag_size;
xdp_rxq->reg_state = REG_STATE_REGISTERED;
return 0;
}
-EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);
+EXPORT_SYMBOL_GPL(__xdp_rxq_info_reg);
void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
{
@@ -281,28 +267,24 @@ static bool __is_supported_mem_type(enum xdp_mem_type type)
return true;
}
-int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
- enum xdp_mem_type type, void *allocator)
+static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem,
+ enum xdp_mem_type type,
+ void *allocator)
{
struct xdp_mem_allocator *xdp_alloc;
gfp_t gfp = GFP_KERNEL;
int id, errno, ret;
void *ptr;
- if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
- WARN(1, "Missing register, driver bug");
- return -EFAULT;
- }
-
if (!__is_supported_mem_type(type))
- return -EOPNOTSUPP;
+ return ERR_PTR(-EOPNOTSUPP);
- xdp_rxq->mem.type = type;
+ mem->type = type;
if (!allocator) {
- if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
- return -EINVAL; /* Setup time check page_pool req */
- return 0;
+ if (type == MEM_TYPE_PAGE_POOL)
+ return ERR_PTR(-EINVAL); /* Setup time check page_pool req */
+ return NULL;
}
/* Delay init of rhashtable to save memory if feature isn't used */
@@ -312,13 +294,13 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
mutex_unlock(&mem_id_lock);
if (ret < 0) {
WARN_ON(1);
- return ret;
+ return ERR_PTR(ret);
}
}
xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
if (!xdp_alloc)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
mutex_lock(&mem_id_lock);
id = __mem_id_cyclic_get(gfp);
@@ -326,31 +308,62 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
errno = id;
goto err;
}
- xdp_rxq->mem.id = id;
- xdp_alloc->mem = xdp_rxq->mem;
+ mem->id = id;
+ xdp_alloc->mem = *mem;
xdp_alloc->allocator = allocator;
/* Insert allocator into ID lookup table */
ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
if (IS_ERR(ptr)) {
- ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id);
- xdp_rxq->mem.id = 0;
+ ida_simple_remove(&mem_id_pool, mem->id);
+ mem->id = 0;
errno = PTR_ERR(ptr);
goto err;
}
if (type == MEM_TYPE_PAGE_POOL)
- page_pool_use_xdp_mem(allocator, mem_allocator_disconnect);
+ page_pool_use_xdp_mem(allocator, mem_allocator_disconnect, mem);
mutex_unlock(&mem_id_lock);
- trace_mem_connect(xdp_alloc, xdp_rxq);
- return 0;
+ return xdp_alloc;
err:
mutex_unlock(&mem_id_lock);
kfree(xdp_alloc);
- return errno;
+ return ERR_PTR(errno);
+}
+
+int xdp_reg_mem_model(struct xdp_mem_info *mem,
+ enum xdp_mem_type type, void *allocator)
+{
+ struct xdp_mem_allocator *xdp_alloc;
+
+ xdp_alloc = __xdp_reg_mem_model(mem, type, allocator);
+ if (IS_ERR(xdp_alloc))
+ return PTR_ERR(xdp_alloc);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_reg_mem_model);
+
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+ enum xdp_mem_type type, void *allocator)
+{
+ struct xdp_mem_allocator *xdp_alloc;
+
+ if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+ WARN(1, "Missing register, driver bug");
+ return -EFAULT;
+ }
+
+ xdp_alloc = __xdp_reg_mem_model(&xdp_rxq->mem, type, allocator);
+ if (IS_ERR(xdp_alloc))
+ return PTR_ERR(xdp_alloc);
+
+ if (trace_mem_connect_enabled() && xdp_alloc)
+ trace_mem_connect(xdp_alloc, xdp_rxq);
+ return 0;
}
+
EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
/* XDP RX runs under NAPI protection, and in different delivery error
@@ -359,21 +372,20 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
* is used for those calls sites. Thus, allowing for faster recycling
* of xdp_frames/pages in those cases.
*/
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
- unsigned long handle)
+void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
+ struct xdp_buff *xdp)
{
- struct xdp_mem_allocator *xa;
struct page *page;
switch (mem->type) {
case MEM_TYPE_PAGE_POOL:
- rcu_read_lock();
- /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
- xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
- napi_direct &= !xdp_return_frame_no_direct();
- page_pool_put_page(xa->page_pool, page, napi_direct);
- rcu_read_unlock();
+ if (napi_direct && xdp_return_frame_no_direct())
+ napi_direct = false;
+ /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
+ * as mem->type knows this a page_pool page
+ */
+ page_pool_put_full_page(page->pp, page, napi_direct);
break;
case MEM_TYPE_PAGE_SHARED:
page_frag_free(data);
@@ -382,34 +394,138 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
page = virt_to_page(data); /* Assumes order0 page*/
put_page(page);
break;
- case MEM_TYPE_ZERO_COPY:
+ case MEM_TYPE_XSK_BUFF_POOL:
/* NB! Only valid from an xdp_buff! */
- rcu_read_lock();
- /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
- xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- xa->zc_alloc->free(xa->zc_alloc, handle);
- rcu_read_unlock();
+ xsk_buff_free(xdp);
+ break;
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+ WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
break;
}
}
void xdp_return_frame(struct xdp_frame *xdpf)
{
- __xdp_return(xdpf->data, &xdpf->mem, false, 0);
+ struct skb_shared_info *sinfo;
+ int i;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+ __xdp_return(page_address(page), &xdpf->mem, false, NULL);
+ }
+out:
+ __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
- __xdp_return(xdpf->data, &xdpf->mem, true, 0);
+ struct skb_shared_info *sinfo;
+ int i;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+ __xdp_return(page_address(page), &xdpf->mem, true, NULL);
+ }
+out:
+ __xdp_return(xdpf->data, &xdpf->mem, true, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+/* XDP bulk APIs introduce a defer/flush mechanism to return
+ * pages belonging to the same xdp_mem_allocator object
+ * (identified via the mem.id field) in bulk to optimize
+ * I-cache and D-cache.
+ * The bulk queue size is set to 16 to be aligned to how
+ * XDP_REDIRECT bulking works. The bulk is flushed when
+ * it is full or when mem.id changes.
+ * xdp_frame_bulk is usually stored/allocated on the function
+ * call-stack to avoid locking penalties.
+ */
+void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
+{
+ struct xdp_mem_allocator *xa = bq->xa;
+
+ if (unlikely(!xa || !bq->count))
+ return;
+
+ page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count);
+ /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */
+ bq->count = 0;
+}
+EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk);
+
+/* Must be called with rcu_read_lock held */
+void xdp_return_frame_bulk(struct xdp_frame *xdpf,
+ struct xdp_frame_bulk *bq)
+{
+ struct xdp_mem_info *mem = &xdpf->mem;
+ struct xdp_mem_allocator *xa;
+
+ if (mem->type != MEM_TYPE_PAGE_POOL) {
+ xdp_return_frame(xdpf);
+ return;
+ }
+
+ xa = bq->xa;
+ if (unlikely(!xa)) {
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ bq->count = 0;
+ bq->xa = xa;
+ }
+
+ if (bq->count == XDP_BULK_QUEUE_SIZE)
+ xdp_flush_frame_bulk(bq);
+
+ if (unlikely(mem->id != xa->mem.id)) {
+ xdp_flush_frame_bulk(bq);
+ bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ }
+
+ if (unlikely(xdp_frame_has_frags(xdpf))) {
+ struct skb_shared_info *sinfo;
+ int i;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ skb_frag_t *frag = &sinfo->frags[i];
+
+ bq->q[bq->count++] = skb_frag_address(frag);
+ if (bq->count == XDP_BULK_QUEUE_SIZE)
+ xdp_flush_frame_bulk(bq);
+ }
+ }
+ bq->q[bq->count++] = xdpf->data;
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
+
void xdp_return_buff(struct xdp_buff *xdp)
{
- __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
+ struct skb_shared_info *sinfo;
+ int i;
+
+ if (likely(!xdp_buff_has_frags(xdp)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+ __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp);
+ }
+out:
+ __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
@@ -428,27 +544,6 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
}
EXPORT_SYMBOL_GPL(__xdp_release_frame);
-int xdp_attachment_query(struct xdp_attachment_info *info,
- struct netdev_bpf *bpf)
-{
- bpf->prog_id = info->prog ? info->prog->aux->id : 0;
- bpf->prog_flags = info->prog ? info->flags : 0;
- return 0;
-}
-EXPORT_SYMBOL_GPL(xdp_attachment_query);
-
-bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
- struct netdev_bpf *bpf)
-{
- if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) {
- NL_SET_ERR_MSG(bpf->extack,
- "program loaded with different flags");
- return false;
- }
- return true;
-}
-EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok);
-
void xdp_attachment_setup(struct xdp_attachment_info *info,
struct netdev_bpf *bpf)
{
@@ -490,9 +585,127 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
xdpf->len = totsize - metasize;
xdpf->headroom = 0;
xdpf->metasize = metasize;
+ xdpf->frame_sz = PAGE_SIZE;
xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
- xdp_return_buff(xdp);
+ xsk_buff_free(xdp);
return xdpf;
}
EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
+
+/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */
+void xdp_warn(const char *msg, const char *func, const int line)
+{
+ WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg);
+};
+EXPORT_SYMBOL_GPL(xdp_warn);
+
+int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp)
+{
+ n_skb = kmem_cache_alloc_bulk(skbuff_head_cache, gfp,
+ n_skb, skbs);
+ if (unlikely(!n_skb))
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk);
+
+struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
+ struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf);
+ unsigned int headroom, frame_size;
+ void *hard_start;
+ u8 nr_frags;
+
+ /* xdp frags frame */
+ if (unlikely(xdp_frame_has_frags(xdpf)))
+ nr_frags = sinfo->nr_frags;
+
+ /* Part of headroom was reserved to xdpf */
+ headroom = sizeof(*xdpf) + xdpf->headroom;
+
+ /* Memory size backing xdp_frame data already have reserved
+ * room for build_skb to place skb_shared_info in tailroom.
+ */
+ frame_size = xdpf->frame_sz;
+
+ hard_start = xdpf->data - headroom;
+ skb = build_skb_around(skb, hard_start, frame_size);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb_reserve(skb, headroom);
+ __skb_put(skb, xdpf->len);
+ if (xdpf->metasize)
+ skb_metadata_set(skb, xdpf->metasize);
+
+ if (unlikely(xdp_frame_has_frags(xdpf)))
+ xdp_update_skb_shared_info(skb, nr_frags,
+ sinfo->xdp_frags_size,
+ nr_frags * xdpf->frame_sz,
+ xdp_frame_is_frag_pfmemalloc(xdpf));
+
+ /* Essential SKB info: protocol and skb->dev */
+ skb->protocol = eth_type_trans(skb, dev);
+
+ /* Optional SKB info, currently missing:
+ * - HW checksum info (skb->ip_summed)
+ * - HW RX hash (skb_set_hash)
+ * - RX ring dev queue index (skb_record_rx_queue)
+ */
+
+ /* Until page_pool get SKB return path, release DMA here */
+ xdp_release_frame(xdpf);
+
+ /* Allow SKB to reuse area used by xdp_frame */
+ xdp_scrub_frame(xdpf);
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame);
+
+struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
+ struct net_device *dev)
+{
+ struct sk_buff *skb;
+
+ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ if (unlikely(!skb))
+ return NULL;
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+
+ return __xdp_build_skb_from_frame(xdpf, skb, dev);
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
+
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
+{
+ unsigned int headroom, totalsize;
+ struct xdp_frame *nxdpf;
+ struct page *page;
+ void *addr;
+
+ headroom = xdpf->headroom + sizeof(*xdpf);
+ totalsize = headroom + xdpf->len;
+
+ if (unlikely(totalsize > PAGE_SIZE))
+ return NULL;
+ page = dev_alloc_page();
+ if (!page)
+ return NULL;
+ addr = page_to_virt(page);
+
+ memcpy(addr, xdpf, totalsize);
+
+ nxdpf = addr;
+ nxdpf->data = addr + headroom;
+ nxdpf->frame_sz = PAGE_SIZE;
+ nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
+ nxdpf->mem.id = 0;
+
+ return nxdpf;
+}