aboutsummaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile2
-rw-r--r--net/core/bpf_sk_storage.c4
-rw-r--r--net/core/dev.c380
-rw-r--r--net/core/dev_addr_lists.c144
-rw-r--r--net/core/dev_ioctl.c264
-rw-r--r--net/core/devlink.c690
-rw-r--r--net/core/drop_monitor.c6
-rw-r--r--net/core/dst.c6
-rw-r--r--net/core/fib_rules.c4
-rw-r--r--net/core/filter.c134
-rw-r--r--net/core/flow_dissector.c30
-rw-r--r--net/core/flow_offload.c90
-rw-r--r--net/core/link_watch.c5
-rw-r--r--net/core/lwtunnel.c5
-rw-r--r--net/core/neighbour.c29
-rw-r--r--net/core/net-procfs.c24
-rw-r--r--net/core/net_namespace.c52
-rw-r--r--net/core/page_pool.c124
-rw-r--r--net/core/pktgen.c167
-rw-r--r--net/core/ptp_classifier.c2
-rw-r--r--net/core/rtnetlink.c34
-rw-r--r--net/core/scm.c4
-rw-r--r--net/core/selftests.c12
-rw-r--r--net/core/skbuff.c92
-rw-r--r--net/core/skmsg.c55
-rw-r--r--net/core/sock.c102
-rw-r--r--net/core/sock_map.c23
27 files changed, 1726 insertions, 758 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index f7f16650fe9e..35ced6201814 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -33,8 +33,6 @@ obj-$(CONFIG_HWBM) += hwbm.o
obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o
obj-$(CONFIG_FAILOVER) += failover.o
-ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
-endif
obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index f564f82e91d9..68d2cbf8331a 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -416,7 +416,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
void *, value, u64, flags)
{
- if (in_irq() || in_nmi())
+ if (in_hardirq() || in_nmi())
return (unsigned long)NULL;
return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags);
@@ -425,7 +425,7 @@ BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
struct sock *, sk)
{
- if (in_irq() || in_nmi())
+ if (in_hardirq() || in_nmi())
return -EPERM;
return ____bpf_sk_storage_delete(map, sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index c253c2aafe97..74fd402d26dd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -131,6 +131,7 @@
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
+#include <trace/events/qdisc.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
@@ -675,131 +676,6 @@ void dev_remove_offload(struct packet_offload *po)
}
EXPORT_SYMBOL(dev_remove_offload);
-/******************************************************************************
- *
- * Device Boot-time Settings Routines
- *
- ******************************************************************************/
-
-/* Boot time configuration table */
-static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
-
-/**
- * netdev_boot_setup_add - add new setup entry
- * @name: name of the device
- * @map: configured settings for the device
- *
- * Adds new setup entry to the dev_boot_setup list. The function
- * returns 0 on error and 1 on success. This is a generic routine to
- * all netdevices.
- */
-static int netdev_boot_setup_add(char *name, struct ifmap *map)
-{
- struct netdev_boot_setup *s;
- int i;
-
- s = dev_boot_setup;
- for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
- if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
- memset(s[i].name, 0, sizeof(s[i].name));
- strlcpy(s[i].name, name, IFNAMSIZ);
- memcpy(&s[i].map, map, sizeof(s[i].map));
- break;
- }
- }
-
- return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
-}
-
-/**
- * netdev_boot_setup_check - check boot time settings
- * @dev: the netdevice
- *
- * Check boot time settings for the device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found, 1 if they are.
- */
-int netdev_boot_setup_check(struct net_device *dev)
-{
- struct netdev_boot_setup *s = dev_boot_setup;
- int i;
-
- for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
- if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
- !strcmp(dev->name, s[i].name)) {
- dev->irq = s[i].map.irq;
- dev->base_addr = s[i].map.base_addr;
- dev->mem_start = s[i].map.mem_start;
- dev->mem_end = s[i].map.mem_end;
- return 1;
- }
- }
- return 0;
-}
-EXPORT_SYMBOL(netdev_boot_setup_check);
-
-
-/**
- * netdev_boot_base - get address from boot time settings
- * @prefix: prefix for network device
- * @unit: id for network device
- *
- * Check boot time settings for the base address of device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found.
- */
-unsigned long netdev_boot_base(const char *prefix, int unit)
-{
- const struct netdev_boot_setup *s = dev_boot_setup;
- char name[IFNAMSIZ];
- int i;
-
- sprintf(name, "%s%d", prefix, unit);
-
- /*
- * If device already registered then return base of 1
- * to indicate not to probe for this interface
- */
- if (__dev_get_by_name(&init_net, name))
- return 1;
-
- for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
- if (!strcmp(name, s[i].name))
- return s[i].map.base_addr;
- return 0;
-}
-
-/*
- * Saves at boot time configured settings for any netdevice.
- */
-int __init netdev_boot_setup(char *str)
-{
- int ints[5];
- struct ifmap map;
-
- str = get_options(str, ARRAY_SIZE(ints), ints);
- if (!str || !*str)
- return 0;
-
- /* Save settings */
- memset(&map, 0, sizeof(map));
- if (ints[0] > 0)
- map.irq = ints[1];
- if (ints[0] > 1)
- map.base_addr = ints[2];
- if (ints[0] > 2)
- map.mem_start = ints[3];
- if (ints[0] > 3)
- map.mem_end = ints[4];
-
- /* Add new entry to the list */
- return netdev_boot_setup_add(str, &map);
-}
-
-__setup("netdev=", netdev_boot_setup);
-
/*******************************************************************************
*
* Device Interface Subroutines
@@ -955,8 +831,7 @@ struct net_device *dev_get_by_name(struct net *net, const char *name)
rcu_read_lock();
dev = dev_get_by_name_rcu(net, name);
- if (dev)
- dev_hold(dev);
+ dev_hold(dev);
rcu_read_unlock();
return dev;
}
@@ -1029,8 +904,7 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex)
rcu_read_lock();
dev = dev_get_by_index_rcu(net, ifindex);
- if (dev)
- dev_hold(dev);
+ dev_hold(dev);
rcu_read_unlock();
return dev;
}
@@ -3098,6 +2972,50 @@ EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
/**
+ * netif_set_real_num_queues - set actual number of RX and TX queues used
+ * @dev: Network device
+ * @txq: Actual number of TX queues
+ * @rxq: Actual number of RX queues
+ *
+ * Set the real number of both TX and RX queues.
+ * Does nothing if the number of queues is already correct.
+ */
+int netif_set_real_num_queues(struct net_device *dev,
+ unsigned int txq, unsigned int rxq)
+{
+ unsigned int old_rxq = dev->real_num_rx_queues;
+ int err;
+
+ if (txq < 1 || txq > dev->num_tx_queues ||
+ rxq < 1 || rxq > dev->num_rx_queues)
+ return -EINVAL;
+
+ /* Start from increases, so the error path only does decreases -
+ * decreases can't fail.
+ */
+ if (rxq > dev->real_num_rx_queues) {
+ err = netif_set_real_num_rx_queues(dev, rxq);
+ if (err)
+ return err;
+ }
+ if (txq > dev->real_num_tx_queues) {
+ err = netif_set_real_num_tx_queues(dev, txq);
+ if (err)
+ goto undo_rx;
+ }
+ if (rxq < dev->real_num_rx_queues)
+ WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
+ if (txq < dev->real_num_tx_queues)
+ WARN_ON(netif_set_real_num_tx_queues(dev, txq));
+
+ return 0;
+undo_rx:
+ WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
+ return err;
+}
+EXPORT_SYMBOL(netif_set_real_num_queues);
+
+/**
* netif_get_num_default_rss_queues - default number of RSS queues
*
* This routine should set an upper limit on the number of RSS queues
@@ -3189,7 +3107,7 @@ EXPORT_SYMBOL(__dev_kfree_skb_irq);
void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
- if (in_irq() || irqs_disabled())
+ if (in_hardirq() || irqs_disabled())
__dev_kfree_skb_irq(skb, reason);
else
dev_kfree_skb(skb);
@@ -3844,6 +3762,18 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
}
}
+static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
+ struct sk_buff **to_free,
+ struct netdev_queue *txq)
+{
+ int rc;
+
+ rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
+ if (rc == NET_XMIT_SUCCESS)
+ trace_qdisc_enqueue(q, txq, skb);
+ return rc;
+}
+
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev,
struct netdev_queue *txq)
@@ -3862,8 +3792,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
* of q->seqlock to protect from racing with requeuing.
*/
if (unlikely(!nolock_qdisc_is_empty(q))) {
- rc = q->enqueue(skb, q, &to_free) &
- NET_XMIT_MASK;
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
__qdisc_run(q);
qdisc_run_end(q);
@@ -3879,7 +3808,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
return NET_XMIT_SUCCESS;
}
- rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
qdisc_run(q);
no_lock_out:
@@ -3923,7 +3852,7 @@ no_lock_out:
qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
- rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
spin_unlock(&q->busylock);
@@ -4000,7 +3929,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
qdisc_skb_cb(skb)->post_ct = false;
mini_qdisc_bstats_cpu_update(miniq, skb);
- switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
+ switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
skb->tc_index = TC_H_MIN(cl_res.classid);
@@ -4744,45 +4673,18 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
return rxqueue;
}
-static u32 netif_receive_generic_xdp(struct sk_buff *skb,
- struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
{
void *orig_data, *orig_data_end, *hard_start;
struct netdev_rx_queue *rxqueue;
- u32 metalen, act = XDP_DROP;
bool orig_bcast, orig_host;
u32 mac_len, frame_sz;
__be16 orig_eth_type;
struct ethhdr *eth;
+ u32 metalen, act;
int off;
- /* Reinjected packets coming from act_mirred or similar should
- * not get XDP generic processing.
- */
- if (skb_is_redirected(skb))
- return XDP_PASS;
-
- /* XDP packets must be linear and must have sufficient headroom
- * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
- * native XDP provides, thus we need to do it here as well.
- */
- if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
- skb_headroom(skb) < XDP_PACKET_HEADROOM) {
- int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
- int troom = skb->tail + skb->data_len - skb->end;
-
- /* In case we have to go down the path and also linearize,
- * then lets do the pskb_expand_head() work just once here.
- */
- if (pskb_expand_head(skb,
- hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
- troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
- goto do_drop;
- if (skb_linearize(skb))
- goto do_drop;
- }
-
/* The XDP program wants to see the packet starting at the MAC
* header.
*/
@@ -4837,6 +4739,13 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
skb->protocol = eth_type_trans(skb, skb->dev);
}
+ /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
+ * before calling us again on redirect path. We do not call do_redirect
+ * as we leave that up to the caller.
+ *
+ * Caller is responsible for managing lifetime of skb (i.e. calling
+ * kfree_skb in response to actions it cannot handle/XDP_DROP).
+ */
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
@@ -4847,6 +4756,49 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
if (metalen)
skb_metadata_set(skb, metalen);
break;
+ }
+
+ return act;
+}
+
+static u32 netif_receive_generic_xdp(struct sk_buff *skb,
+ struct xdp_buff *xdp,
+ struct bpf_prog *xdp_prog)
+{
+ u32 act = XDP_DROP;
+
+ /* Reinjected packets coming from act_mirred or similar should
+ * not get XDP generic processing.
+ */
+ if (skb_is_redirected(skb))
+ return XDP_PASS;
+
+ /* XDP packets must be linear and must have sufficient headroom
+ * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
+ * native XDP provides, thus we need to do it here as well.
+ */
+ if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
+ skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+ int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+ int troom = skb->tail + skb->data_len - skb->end;
+
+ /* In case we have to go down the path and also linearize,
+ * then lets do the pskb_expand_head() work just once here.
+ */
+ if (pskb_expand_head(skb,
+ hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+ troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
+ goto do_drop;
+ if (skb_linearize(skb))
+ goto do_drop;
+ }
+
+ act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
+ switch (act) {
+ case XDP_REDIRECT:
+ case XDP_TX:
+ case XDP_PASS:
+ break;
default:
bpf_warn_invalid_xdp_action(act);
fallthrough;
@@ -5129,8 +5081,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
skb->tc_at_ingress = 1;
mini_qdisc_bstats_cpu_update(miniq, skb);
- switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
- &cl_res, false)) {
+ switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
skb->tc_index = TC_H_MIN(cl_res.classid);
@@ -5312,7 +5263,6 @@ another_round:
ret = NET_RX_DROP;
goto out;
}
- skb_reset_mac_len(skb);
}
if (eth_type_vlan(skb->protocol)) {
@@ -5638,25 +5588,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
struct bpf_prog *new = xdp->prog;
int ret = 0;
- if (new) {
- u32 i;
-
- mutex_lock(&new->aux->used_maps_mutex);
-
- /* generic XDP does not work with DEVMAPs that can
- * have a bpf_prog installed on an entry
- */
- for (i = 0; i < new->aux->used_map_cnt; i++) {
- if (dev_map_can_have_prog(new->aux->used_maps[i]) ||
- cpu_map_prog_allowed(new->aux->used_maps[i])) {
- mutex_unlock(&new->aux->used_maps_mutex);
- return -EINVAL;
- }
- }
-
- mutex_unlock(&new->aux->used_maps_mutex);
- }
-
switch (xdp->command) {
case XDP_SETUP_PROG:
rcu_assign_pointer(dev->xdp_prog, new);
@@ -5864,7 +5795,7 @@ static void flush_all_backlogs(void)
*/
ASSERT_RTNL();
- get_online_cpus();
+ cpus_read_lock();
cpumask_clear(&flush_cpus);
for_each_online_cpu(cpu) {
@@ -5882,7 +5813,7 @@ static void flush_all_backlogs(void)
for_each_cpu(cpu, &flush_cpus)
flush_work(per_cpu_ptr(&flush_works, cpu));
- put_online_cpus();
+ cpus_read_unlock();
}
/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
@@ -5999,7 +5930,6 @@ static void gro_list_prepare(const struct list_head *head,
diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
if (skb_vlan_tag_present(p))
diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
- diffs |= skb_metadata_dst_cmp(p, skb);
diffs |= skb_metadata_differs(p, skb);
if (maclen == ETH_HLEN)
diffs |= compare_ether_header(skb_mac_header(p),
@@ -6008,6 +5938,32 @@ static void gro_list_prepare(const struct list_head *head,
diffs = memcmp(skb_mac_header(p),
skb_mac_header(skb),
maclen);
+
+ /* in most common scenarions 'slow_gro' is 0
+ * otherwise we are already on some slower paths
+ * either skip all the infrequent tests altogether or
+ * avoid trying too hard to skip each of them individually
+ */
+ if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) {
+#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ struct tc_skb_ext *skb_ext;
+ struct tc_skb_ext *p_ext;
+#endif
+
+ diffs |= p->sk != skb->sk;
+ diffs |= skb_metadata_dst_cmp(p, skb);
+ diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
+
+#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ skb_ext = skb_ext_find(skb, TC_SKB_EXT);
+ p_ext = skb_ext_find(p, TC_SKB_EXT);
+
+ diffs |= (!!p_ext) ^ (!!skb_ext);
+ if (!diffs && unlikely(skb_ext))
+ diffs |= p_ext->chain ^ skb_ext->chain;
+#endif
+ }
+
NAPI_GRO_CB(p)->same_flow = !diffs;
}
}
@@ -6221,6 +6177,8 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
case GRO_MERGED_FREE:
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
napi_skb_free_stolen_head(skb);
+ else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+ __kfree_skb(skb);
else
__kfree_skb_defer(skb);
break;
@@ -6269,7 +6227,12 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->encapsulation = 0;
skb_shinfo(skb)->gso_type = 0;
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
- skb_ext_reset(skb);
+ if (unlikely(skb->slow_gro)) {
+ skb_orphan(skb);
+ skb_ext_reset(skb);
+ nf_reset_ct(skb);
+ skb->slow_gro = 0;
+ }
napi->skb = skb;
}
@@ -7569,7 +7532,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev,
{
struct netdev_adjacent *lower;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
@@ -9334,7 +9297,7 @@ static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
return dev->xdp_state[mode].prog;
}
-static u8 dev_xdp_prog_count(struct net_device *dev)
+u8 dev_xdp_prog_count(struct net_device *dev)
{
u8 count = 0;
int i;
@@ -9344,6 +9307,7 @@ static u8 dev_xdp_prog_count(struct net_device *dev)
count++;
return count;
}
+EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
@@ -9437,6 +9401,8 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
{
unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
struct bpf_prog *cur_prog;
+ struct net_device *upper;
+ struct list_head *iter;
enum bpf_xdp_mode mode;
bpf_op_t bpf_op;
int err;
@@ -9475,6 +9441,14 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
return -EBUSY;
}
+ /* don't allow if an upper device already has a program */
+ netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+ if (dev_xdp_prog_count(upper) > 0) {
+ NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
+ return -EEXIST;
+ }
+ }
+
cur_prog = dev_xdp_prog(dev, mode);
/* can't replace attached prog with link */
if (link && cur_prog) {
@@ -9684,14 +9658,17 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
struct net_device *dev;
int err, fd;
+ rtnl_lock();
dev = dev_get_by_index(net, attr->link_create.target_ifindex);
- if (!dev)
+ if (!dev) {
+ rtnl_unlock();
return -EINVAL;
+ }
link = kzalloc(sizeof(*link), GFP_USER);
if (!link) {
err = -ENOMEM;
- goto out_put_dev;
+ goto unlock;
}
bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
@@ -9701,14 +9678,14 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
kfree(link);
- goto out_put_dev;
+ goto unlock;
}
- rtnl_lock();
err = dev_xdp_attach_link(dev, NULL, link);
rtnl_unlock();
if (err) {
+ link->dev = NULL;
bpf_link_cleanup(&link_primer);
goto out_put_dev;
}
@@ -9718,6 +9695,9 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
dev_put(dev);
return fd;
+unlock:
+ rtnl_unlock();
+
out_put_dev:
dev_put(dev);
return err;
@@ -10100,7 +10080,7 @@ static int netif_alloc_rx_queues(struct net_device *dev)
BUG_ON(count < 1);
- rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!rx)
return -ENOMEM;
@@ -10167,7 +10147,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
if (count < 1 || count > 0xffff)
return -EINVAL;
- tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!tx)
return -ENOMEM;
@@ -10807,7 +10787,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1;
- p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!p)
return NULL;
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 45ae6eeb2964..8c39283c26ae 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -16,10 +16,9 @@
* General list handling functions
*/
-static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
- const unsigned char *addr, int addr_len,
- unsigned char addr_type, bool global,
- bool sync)
+static struct netdev_hw_addr*
+__hw_addr_create(const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync)
{
struct netdev_hw_addr *ha;
int alloc_size;
@@ -29,32 +28,44 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
alloc_size = L1_CACHE_BYTES;
ha = kmalloc(alloc_size, GFP_ATOMIC);
if (!ha)
- return -ENOMEM;
+ return NULL;
memcpy(ha->addr, addr, addr_len);
ha->type = addr_type;
ha->refcount = 1;
ha->global_use = global;
ha->synced = sync ? 1 : 0;
ha->sync_cnt = 0;
- list_add_tail_rcu(&ha->list, &list->list);
- list->count++;
- return 0;
+ return ha;
}
static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global, bool sync,
- int sync_count)
+ int sync_count, bool exclusive)
{
+ struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL;
struct netdev_hw_addr *ha;
if (addr_len > MAX_ADDR_LEN)
return -EINVAL;
- list_for_each_entry(ha, &list->list, list) {
- if (ha->type == addr_type &&
- !memcmp(ha->addr, addr, addr_len)) {
+ while (*ins_point) {
+ int diff;
+
+ ha = rb_entry(*ins_point, struct netdev_hw_addr, node);
+ diff = memcmp(addr, ha->addr, addr_len);
+ if (diff == 0)
+ diff = memcmp(&addr_type, &ha->type, sizeof(addr_type));
+
+ parent = *ins_point;
+ if (diff < 0) {
+ ins_point = &parent->rb_left;
+ } else if (diff > 0) {
+ ins_point = &parent->rb_right;
+ } else {
+ if (exclusive)
+ return -EEXIST;
if (global) {
/* check if addr is already used as global */
if (ha->global_use)
@@ -73,8 +84,25 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
}
}
- return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
- sync);
+ ha = __hw_addr_create(addr, addr_len, addr_type, global, sync);
+ if (!ha)
+ return -ENOMEM;
+
+ /* The first address in dev->dev_addrs is pointed to by dev->dev_addr
+ * and mutated freely by device drivers and netdev ops, so if we insert
+ * it into the tree we'll end up with an invalid rbtree.
+ */
+ if (list->count > 0) {
+ rb_link_node(&ha->node, parent, ins_point);
+ rb_insert_color(&ha->node, &list->tree);
+ } else {
+ RB_CLEAR_NODE(&ha->node);
+ }
+
+ list_add_tail_rcu(&ha->list, &list->list);
+ list->count++;
+
+ return 0;
}
static int __hw_addr_add(struct netdev_hw_addr_list *list,
@@ -82,7 +110,7 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list,
unsigned char addr_type)
{
return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false,
- 0);
+ 0, false);
}
static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
@@ -103,24 +131,61 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
if (--ha->refcount)
return 0;
+
+ if (!RB_EMPTY_NODE(&ha->node))
+ rb_erase(&ha->node, &list->tree);
+
list_del_rcu(&ha->list);
kfree_rcu(ha, rcu_head);
list->count--;
return 0;
}
+static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
+{
+ struct netdev_hw_addr *ha;
+ struct rb_node *node;
+
+ /* The first address isn't inserted into the tree because in the dev->dev_addrs
+ * list it's the address pointed to by dev->dev_addr which is freely mutated
+ * in place, so we need to check it separately.
+ */
+ ha = list_first_entry(&list->list, struct netdev_hw_addr, list);
+ if (ha && !memcmp(addr, ha->addr, addr_len) &&
+ (!addr_type || addr_type == ha->type))
+ return ha;
+
+ node = list->tree.rb_node;
+
+ while (node) {
+ struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node);
+ int diff = memcmp(addr, ha->addr, addr_len);
+
+ if (diff == 0 && addr_type)
+ diff = memcmp(&addr_type, &ha->type, sizeof(addr_type));
+
+ if (diff < 0)
+ node = node->rb_left;
+ else if (diff > 0)
+ node = node->rb_right;
+ else
+ return ha;
+ }
+
+ return NULL;
+}
+
static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global, bool sync)
{
- struct netdev_hw_addr *ha;
+ struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type);
- list_for_each_entry(ha, &list->list, list) {
- if (!memcmp(ha->addr, addr, addr_len) &&
- (ha->type == addr_type || !addr_type))
- return __hw_addr_del_entry(list, ha, global, sync);
- }
- return -ENOENT;
+ if (!ha)
+ return -ENOENT;
+ return __hw_addr_del_entry(list, ha, global, sync);
}
static int __hw_addr_del(struct netdev_hw_addr_list *list,
@@ -137,7 +202,7 @@ static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list,
int err;
err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type,
- false, true, ha->sync_cnt);
+ false, true, ha->sync_cnt, false);
if (err && err != -EEXIST)
return err;
@@ -407,6 +472,7 @@ static void __hw_addr_flush(struct netdev_hw_addr_list *list)
{
struct netdev_hw_addr *ha, *tmp;
+ list->tree = RB_ROOT;
list_for_each_entry_safe(ha, tmp, &list->list, list) {
list_del_rcu(&ha->list);
kfree_rcu(ha, rcu_head);
@@ -418,6 +484,7 @@ void __hw_addr_init(struct netdev_hw_addr_list *list)
{
INIT_LIST_HEAD(&list->list);
list->count = 0;
+ list->tree = RB_ROOT;
}
EXPORT_SYMBOL(__hw_addr_init);
@@ -552,22 +619,14 @@ EXPORT_SYMBOL(dev_addr_del);
*/
int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
{
- struct netdev_hw_addr *ha;
int err;
netif_addr_lock_bh(dev);
- list_for_each_entry(ha, &dev->uc.list, list) {
- if (!memcmp(ha->addr, addr, dev->addr_len) &&
- ha->type == NETDEV_HW_ADDR_T_UNICAST) {
- err = -EEXIST;
- goto out;
- }
- }
- err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
- NETDEV_HW_ADDR_T_UNICAST, true, false);
+ err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST, true, false,
+ 0, true);
if (!err)
__dev_set_rx_mode(dev);
-out:
netif_addr_unlock_bh(dev);
return err;
}
@@ -745,22 +804,14 @@ EXPORT_SYMBOL(dev_uc_init);
*/
int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
{
- struct netdev_hw_addr *ha;
int err;
netif_addr_lock_bh(dev);
- list_for_each_entry(ha, &dev->mc.list, list) {
- if (!memcmp(ha->addr, addr, dev->addr_len) &&
- ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
- err = -EEXIST;
- goto out;
- }
- }
- err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
- NETDEV_HW_ADDR_T_MULTICAST, true, false);
+ err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, true, false,
+ 0, true);
if (!err)
__dev_set_rx_mode(dev);
-out:
netif_addr_unlock_bh(dev);
return err;
}
@@ -773,7 +824,8 @@ static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
netif_addr_lock_bh(dev);
err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
- NETDEV_HW_ADDR_T_MULTICAST, global, false, 0);
+ NETDEV_HW_ADDR_T_MULTICAST, global, false,
+ 0, false);
if (!err)
__dev_set_rx_mode(dev);
netif_addr_unlock_bh(dev);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 478d032f34ac..0e87237fd871 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -1,10 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kmod.h>
#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/net_tstamp.h>
#include <linux/wireless.h>
+#include <linux/if_bridge.h>
#include <net/dsa.h>
#include <net/wext.h>
@@ -25,79 +27,108 @@ static int dev_ifname(struct net *net, struct ifreq *ifr)
return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex);
}
-static gifconf_func_t *gifconf_list[NPROTO];
-
-/**
- * register_gifconf - register a SIOCGIF handler
- * @family: Address family
- * @gifconf: Function handler
- *
- * Register protocol dependent address dumping routines. The handler
- * that is passed must not be freed or reused until it has been replaced
- * by another handler.
- */
-int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
-{
- if (family >= NPROTO)
- return -EINVAL;
- gifconf_list[family] = gifconf;
- return 0;
-}
-EXPORT_SYMBOL(register_gifconf);
-
/*
* Perform a SIOCGIFCONF call. This structure will change
* size eventually, and there is nothing I can do about it.
* Thus we will need a 'compatibility mode'.
*/
-
-int dev_ifconf(struct net *net, struct ifconf *ifc, int size)
+int dev_ifconf(struct net *net, struct ifconf __user *uifc)
{
struct net_device *dev;
- char __user *pos;
- int len;
- int total;
- int i;
+ void __user *pos;
+ size_t size;
+ int len, total = 0, done;
- /*
- * Fetch the caller's info block.
- */
+ /* both the ifconf and the ifreq structures are slightly different */
+ if (in_compat_syscall()) {
+ struct compat_ifconf ifc32;
- pos = ifc->ifc_buf;
- len = ifc->ifc_len;
+ if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf)))
+ return -EFAULT;
- /*
- * Loop over the interfaces, and write an info block for each.
- */
+ pos = compat_ptr(ifc32.ifcbuf);
+ len = ifc32.ifc_len;
+ size = sizeof(struct compat_ifreq);
+ } else {
+ struct ifconf ifc;
+
+ if (copy_from_user(&ifc, uifc, sizeof(struct ifconf)))
+ return -EFAULT;
- total = 0;
+ pos = ifc.ifc_buf;
+ len = ifc.ifc_len;
+ size = sizeof(struct ifreq);
+ }
+
+ /* Loop over the interfaces, and write an info block for each. */
+ rtnl_lock();
for_each_netdev(net, dev) {
- for (i = 0; i < NPROTO; i++) {
- if (gifconf_list[i]) {
- int done;
- if (!pos)
- done = gifconf_list[i](dev, NULL, 0, size);
- else
- done = gifconf_list[i](dev, pos + total,
- len - total, size);
- if (done < 0)
- return -EFAULT;
- total += done;
- }
+ if (!pos)
+ done = inet_gifconf(dev, NULL, 0, size);
+ else
+ done = inet_gifconf(dev, pos + total,
+ len - total, size);
+ if (done < 0) {
+ rtnl_unlock();
+ return -EFAULT;
}
+ total += done;
}
+ rtnl_unlock();
- /*
- * All done. Write the updated control block back to the caller.
- */
- ifc->ifc_len = total;
+ return put_user(total, &uifc->ifc_len);
+}
+
+static int dev_getifmap(struct net_device *dev, struct ifreq *ifr)
+{
+ struct ifmap *ifmap = &ifr->ifr_map;
+
+ if (in_compat_syscall()) {
+ struct compat_ifmap *cifmap = (struct compat_ifmap *)ifmap;
+
+ cifmap->mem_start = dev->mem_start;
+ cifmap->mem_end = dev->mem_end;
+ cifmap->base_addr = dev->base_addr;
+ cifmap->irq = dev->irq;
+ cifmap->dma = dev->dma;
+ cifmap->port = dev->if_port;
+
+ return 0;
+ }
+
+ ifmap->mem_start = dev->mem_start;
+ ifmap->mem_end = dev->mem_end;
+ ifmap->base_addr = dev->base_addr;
+ ifmap->irq = dev->irq;
+ ifmap->dma = dev->dma;
+ ifmap->port = dev->if_port;
- /*
- * Both BSD and Solaris return 0 here, so we do too.
- */
return 0;
}
+static int dev_setifmap(struct net_device *dev, struct ifreq *ifr)
+{
+ struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map;
+
+ if (!dev->netdev_ops->ndo_set_config)
+ return -EOPNOTSUPP;
+
+ if (in_compat_syscall()) {
+ struct ifmap ifmap = {
+ .mem_start = cifmap->mem_start,
+ .mem_end = cifmap->mem_end,
+ .base_addr = cifmap->base_addr,
+ .irq = cifmap->irq,
+ .dma = cifmap->dma,
+ .port = cifmap->port,
+ };
+
+ return dev->netdev_ops->ndo_set_config(dev, &ifmap);
+ }
+
+ return dev->netdev_ops->ndo_set_config(dev, &ifr->ifr_map);
+}
+
/*
* Perform the SIOCxIFxxx calls, inside rcu_read_lock()
*/
@@ -128,13 +159,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
break;
case SIOCGIFMAP:
- ifr->ifr_map.mem_start = dev->mem_start;
- ifr->ifr_map.mem_end = dev->mem_end;
- ifr->ifr_map.base_addr = dev->base_addr;
- ifr->ifr_map.irq = dev->irq;
- ifr->ifr_map.dma = dev->dma;
- ifr->ifr_map.port = dev->if_port;
- return 0;
+ return dev_getifmap(dev, ifr);
case SIOCGIFINDEX:
ifr->ifr_ifindex = dev->ifindex;
@@ -215,19 +240,19 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
return 0;
}
-static int dev_do_ioctl(struct net_device *dev,
- struct ifreq *ifr, unsigned int cmd)
+static int dev_eth_ioctl(struct net_device *dev,
+ struct ifreq *ifr, unsigned int cmd)
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
- err = dsa_ndo_do_ioctl(dev, ifr, cmd);
+ err = dsa_ndo_eth_ioctl(dev, ifr, cmd);
if (err == 0 || err != -EOPNOTSUPP)
return err;
- if (ops->ndo_do_ioctl) {
+ if (ops->ndo_eth_ioctl) {
if (netif_device_present(dev))
- err = ops->ndo_do_ioctl(dev, ifr, cmd);
+ err = ops->ndo_eth_ioctl(dev, ifr, cmd);
else
err = -ENODEV;
}
@@ -235,10 +260,55 @@ static int dev_do_ioctl(struct net_device *dev,
return err;
}
+static int dev_siocbond(struct net_device *dev,
+ struct ifreq *ifr, unsigned int cmd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_siocbond) {
+ if (netif_device_present(dev))
+ return ops->ndo_siocbond(dev, ifr, cmd);
+ else
+ return -ENODEV;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ void __user *data, unsigned int cmd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_siocdevprivate) {
+ if (netif_device_present(dev))
+ return ops->ndo_siocdevprivate(dev, ifr, data, cmd);
+ else
+ return -ENODEV;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_siocwandev) {
+ if (netif_device_present(dev))
+ return ops->ndo_siocwandev(dev, ifs);
+ else
+ return -ENODEV;
+ }
+
+ return -EOPNOTSUPP;
+}
+
/*
* Perform the SIOCxIFxxx calls, inside rtnl_lock()
*/
-static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
+static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
+ unsigned int cmd)
{
int err;
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
@@ -275,12 +345,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
return 0;
case SIOCSIFMAP:
- if (ops->ndo_set_config) {
- if (!netif_device_present(dev))
- return -ENODEV;
- return ops->ndo_set_config(dev, &ifr->ifr_map);
- }
- return -EOPNOTSUPP;
+ return dev_setifmap(dev, ifr);
case SIOCADDMULTI:
if (!ops->ndo_set_rx_mode ||
@@ -307,6 +372,22 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
ifr->ifr_newname[IFNAMSIZ-1] = '\0';
return dev_change_name(dev, ifr->ifr_newname);
+ case SIOCWANDEV:
+ return dev_siocwandev(dev, &ifr->ifr_settings);
+
+ case SIOCBRADDIF:
+ case SIOCBRDELIF:
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ if (!netif_is_bridge_master(dev))
+ return -EOPNOTSUPP;
+ dev_hold(dev);
+ rtnl_unlock();
+ err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL);
+ dev_put(dev);
+ rtnl_lock();
+ return err;
+
case SIOCSHWTSTAMP:
err = net_hwtstamp_validate(ifr);
if (err)
@@ -317,23 +398,23 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
* Unknown or private ioctl
*/
default:
- if ((cmd >= SIOCDEVPRIVATE &&
- cmd <= SIOCDEVPRIVATE + 15) ||
- cmd == SIOCBONDENSLAVE ||
+ if (cmd >= SIOCDEVPRIVATE &&
+ cmd <= SIOCDEVPRIVATE + 15)
+ return dev_siocdevprivate(dev, ifr, data, cmd);
+
+ if (cmd == SIOCGMIIPHY ||
+ cmd == SIOCGMIIREG ||
+ cmd == SIOCSMIIREG ||
+ cmd == SIOCSHWTSTAMP ||
+ cmd == SIOCGHWTSTAMP) {
+ err = dev_eth_ioctl(dev, ifr, cmd);
+ } else if (cmd == SIOCBONDENSLAVE ||
cmd == SIOCBONDRELEASE ||
cmd == SIOCBONDSETHWADDR ||
cmd == SIOCBONDSLAVEINFOQUERY ||
cmd == SIOCBONDINFOQUERY ||
- cmd == SIOCBONDCHANGEACTIVE ||
- cmd == SIOCGMIIPHY ||
- cmd == SIOCGMIIREG ||
- cmd == SIOCSMIIREG ||
- cmd == SIOCBRADDIF ||
- cmd == SIOCBRDELIF ||
- cmd == SIOCSHWTSTAMP ||
- cmd == SIOCGHWTSTAMP ||
- cmd == SIOCWANDEV) {
- err = dev_do_ioctl(dev, ifr, cmd);
+ cmd == SIOCBONDCHANGEACTIVE) {
+ err = dev_siocbond(dev, ifr, cmd);
} else
err = -EINVAL;
@@ -386,7 +467,8 @@ EXPORT_SYMBOL(dev_load);
* positive or a negative errno code on error.
*/
-int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout)
+int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
+ void __user *data, bool *need_copyout)
{
int ret;
char *colon;
@@ -437,7 +519,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCETHTOOL:
dev_load(net, ifr->ifr_name);
rtnl_lock();
- ret = dev_ethtool(net, ifr);
+ ret = dev_ethtool(net, ifr, data);
rtnl_unlock();
if (colon)
*colon = ':';
@@ -456,7 +538,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
rtnl_lock();
- ret = dev_ifsioc(net, ifr, cmd);
+ ret = dev_ifsioc(net, ifr, data, cmd);
rtnl_unlock();
if (colon)
*colon = ':';
@@ -502,7 +584,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCBONDINFOQUERY:
dev_load(net, ifr->ifr_name);
rtnl_lock();
- ret = dev_ifsioc(net, ifr, cmd);
+ ret = dev_ifsioc(net, ifr, data, cmd);
rtnl_unlock();
if (need_copyout)
*need_copyout = false;
@@ -527,7 +609,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
cmd <= SIOCDEVPRIVATE + 15)) {
dev_load(net, ifr->ifr_name);
rtnl_lock();
- ret = dev_ifsioc(net, ifr, cmd);
+ ret = dev_ifsioc(net, ifr, data, cmd);
rtnl_unlock();
return ret;
}
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 8fdd04f00fd7..a856ae401ea5 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -92,7 +92,8 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_
DEVLINK_PORT_FN_STATE_ACTIVE),
};
-static LIST_HEAD(devlink_list);
+static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC);
+#define DEVLINK_REGISTERED XA_MARK_1
/* devlink_mutex
*
@@ -108,23 +109,23 @@ struct net *devlink_net(const struct devlink *devlink)
}
EXPORT_SYMBOL_GPL(devlink_net);
-static void __devlink_net_set(struct devlink *devlink, struct net *net)
+static void devlink_put(struct devlink *devlink)
{
- write_pnet(&devlink->_net, net);
+ if (refcount_dec_and_test(&devlink->refcount))
+ complete(&devlink->comp);
}
-void devlink_net_set(struct devlink *devlink, struct net *net)
+static bool __must_check devlink_try_get(struct devlink *devlink)
{
- if (WARN_ON(devlink->registered))
- return;
- __devlink_net_set(devlink, net);
+ return refcount_inc_not_zero(&devlink->refcount);
}
-EXPORT_SYMBOL_GPL(devlink_net_set);
static struct devlink *devlink_get_from_attrs(struct net *net,
struct nlattr **attrs)
{
struct devlink *devlink;
+ unsigned long index;
+ bool found = false;
char *busname;
char *devname;
@@ -136,19 +137,19 @@ static struct devlink *devlink_get_from_attrs(struct net *net,
lockdep_assert_held(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
if (strcmp(devlink->dev->bus->name, busname) == 0 &&
strcmp(dev_name(devlink->dev), devname) == 0 &&
- net_eq(devlink_net(devlink), net))
- return devlink;
+ net_eq(devlink_net(devlink), net)) {
+ found = true;
+ break;
+ }
}
- return ERR_PTR(-ENODEV);
-}
+ if (!found || !devlink_try_get(devlink))
+ devlink = ERR_PTR(-ENODEV);
-static struct devlink *devlink_get_from_info(struct genl_info *info)
-{
- return devlink_get_from_attrs(genl_info_net(info), info->attrs);
+ return devlink;
}
static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
@@ -499,7 +500,7 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops,
int err;
mutex_lock(&devlink_mutex);
- devlink = devlink_get_from_info(info);
+ devlink = devlink_get_from_attrs(genl_info_net(info), info->attrs);
if (IS_ERR(devlink)) {
mutex_unlock(&devlink_mutex);
return PTR_ERR(devlink);
@@ -542,6 +543,7 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops,
unlock:
if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
mutex_unlock(&devlink_mutex);
return err;
}
@@ -554,6 +556,7 @@ static void devlink_nl_post_doit(const struct genl_ops *ops,
devlink = info->user_ptr[0];
if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
mutex_unlock(&devlink_mutex);
}
@@ -817,10 +820,11 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
return 0;
}
-static int
-devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops *ops,
- struct devlink_port *port, struct sk_buff *msg,
- struct netlink_ext_ack *extack, bool *msg_updated)
+static int devlink_port_fn_hw_addr_fill(const struct devlink_ops *ops,
+ struct devlink_port *port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
{
u8 hw_addr[MAX_ADDR_LEN];
int hw_addr_len;
@@ -829,7 +833,8 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops *
if (!ops->port_function_hw_addr_get)
return 0;
- err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack);
+ err = ops->port_function_hw_addr_get(port, hw_addr, &hw_addr_len,
+ extack);
if (err) {
if (err == -EOPNOTSUPP)
return 0;
@@ -843,12 +848,11 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops *
}
static int devlink_nl_rate_fill(struct sk_buff *msg,
- struct devlink *devlink,
struct devlink_rate *devlink_rate,
- enum devlink_command cmd, u32 portid,
- u32 seq, int flags,
- struct netlink_ext_ack *extack)
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags, struct netlink_ext_ack *extack)
{
+ struct devlink *devlink = devlink_rate->devlink;
void *hdr;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
@@ -906,12 +910,11 @@ devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate)
opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED;
}
-static int
-devlink_port_fn_state_fill(struct devlink *devlink,
- const struct devlink_ops *ops,
- struct devlink_port *port, struct sk_buff *msg,
- struct netlink_ext_ack *extack,
- bool *msg_updated)
+static int devlink_port_fn_state_fill(const struct devlink_ops *ops,
+ struct devlink_port *port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
{
enum devlink_port_fn_opstate opstate;
enum devlink_port_fn_state state;
@@ -920,7 +923,7 @@ devlink_port_fn_state_fill(struct devlink *devlink,
if (!ops->port_fn_state_get)
return 0;
- err = ops->port_fn_state_get(devlink, port, &state, &opstate, extack);
+ err = ops->port_fn_state_get(port, &state, &opstate, extack);
if (err) {
if (err == -EOPNOTSUPP)
return 0;
@@ -948,7 +951,6 @@ static int
devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
struct netlink_ext_ack *extack)
{
- struct devlink *devlink = port->devlink;
const struct devlink_ops *ops;
struct nlattr *function_attr;
bool msg_updated = false;
@@ -958,13 +960,12 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por
if (!function_attr)
return -EMSGSIZE;
- ops = devlink->ops;
- err = devlink_port_fn_hw_addr_fill(devlink, ops, port, msg,
- extack, &msg_updated);
+ ops = port->devlink->ops;
+ err = devlink_port_fn_hw_addr_fill(ops, port, msg, extack,
+ &msg_updated);
if (err)
goto out;
- err = devlink_port_fn_state_fill(devlink, ops, port, msg, extack,
- &msg_updated);
+ err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated);
out:
if (err || !msg_updated)
nla_nest_cancel(msg, function_attr);
@@ -973,12 +974,12 @@ out:
return err;
}
-static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
+static int devlink_nl_port_fill(struct sk_buff *msg,
struct devlink_port *devlink_port,
- enum devlink_command cmd, u32 portid,
- u32 seq, int flags,
- struct netlink_ext_ack *extack)
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags, struct netlink_ext_ack *extack)
{
+ struct devlink *devlink = devlink_port->devlink;
void *hdr;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
@@ -1039,53 +1040,47 @@ nla_put_failure:
static void devlink_port_notify(struct devlink_port *devlink_port,
enum devlink_command cmd)
{
- struct devlink *devlink = devlink_port->devlink;
struct sk_buff *msg;
int err;
- if (!devlink_port->registered)
- return;
-
WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
- err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0,
- NULL);
+ err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL);
if (err) {
nlmsg_free(msg);
return;
}
- genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
- msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+ genlmsg_multicast_netns(&devlink_nl_family,
+ devlink_net(devlink_port->devlink), msg, 0,
+ DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
static void devlink_rate_notify(struct devlink_rate *devlink_rate,
enum devlink_command cmd)
{
- struct devlink *devlink = devlink_rate->devlink;
struct sk_buff *msg;
int err;
- WARN_ON(cmd != DEVLINK_CMD_RATE_NEW &&
- cmd != DEVLINK_CMD_RATE_DEL);
+ WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL);
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
- err = devlink_nl_rate_fill(msg, devlink, devlink_rate,
- cmd, 0, 0, 0, NULL);
+ err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL);
if (err) {
nlmsg_free(msg);
return;
}
- genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
- msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+ genlmsg_multicast_netns(&devlink_nl_family,
+ devlink_net(devlink_rate->devlink), msg, 0,
+ DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg,
@@ -1094,13 +1089,18 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg,
struct devlink_rate *devlink_rate;
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err = 0;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
continue;
+
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ goto retry;
+
mutex_lock(&devlink->lock);
list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
enum devlink_command cmd = DEVLINK_CMD_RATE_NEW;
@@ -1110,18 +1110,19 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg,
idx++;
continue;
}
- err = devlink_nl_rate_fill(msg, devlink,
- devlink_rate,
- cmd, id,
+ err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id,
cb->nlh->nlmsg_seq,
NLM_F_MULTI, NULL);
if (err) {
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
goto out;
}
idx++;
}
mutex_unlock(&devlink->lock);
+retry:
+ devlink_put(devlink);
}
out:
mutex_unlock(&devlink_mutex);
@@ -1136,7 +1137,6 @@ static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_rate *devlink_rate = info->user_ptr[1];
- struct devlink *devlink = devlink_rate->devlink;
struct sk_buff *msg;
int err;
@@ -1144,8 +1144,7 @@ static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb,
if (!msg)
return -ENOMEM;
- err = devlink_nl_rate_fill(msg, devlink, devlink_rate,
- DEVLINK_CMD_RATE_NEW,
+ err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW,
info->snd_portid, info->snd_seq, 0,
info->extack);
if (err) {
@@ -1193,20 +1192,30 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
{
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
+ continue;
+
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) {
+ devlink_put(devlink);
continue;
+ }
+
if (idx < start) {
idx++;
+ devlink_put(devlink);
continue;
}
+
err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ devlink_put(devlink);
if (err)
goto out;
idx++;
@@ -1222,7 +1231,6 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_port *devlink_port = info->user_ptr[1];
- struct devlink *devlink = devlink_port->devlink;
struct sk_buff *msg;
int err;
@@ -1230,8 +1238,7 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
if (!msg)
return -ENOMEM;
- err = devlink_nl_port_fill(msg, devlink, devlink_port,
- DEVLINK_CMD_PORT_NEW,
+ err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW,
info->snd_portid, info->snd_seq, 0,
info->extack);
if (err) {
@@ -1248,32 +1255,39 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
struct devlink *devlink;
struct devlink_port *devlink_port;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
continue;
+
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ goto retry;
+
mutex_lock(&devlink->lock);
list_for_each_entry(devlink_port, &devlink->port_list, list) {
if (idx < start) {
idx++;
continue;
}
- err = devlink_nl_port_fill(msg, devlink, devlink_port,
+ err = devlink_nl_port_fill(msg, devlink_port,
DEVLINK_CMD_NEW,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
- NLM_F_MULTI,
- cb->extack);
+ NLM_F_MULTI, cb->extack);
if (err) {
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
goto out;
}
idx++;
}
mutex_unlock(&devlink->lock);
+retry:
+ devlink_put(devlink);
}
out:
mutex_unlock(&devlink_mutex);
@@ -1282,31 +1296,33 @@ out:
return msg->len;
}
-static int devlink_port_type_set(struct devlink *devlink,
- struct devlink_port *devlink_port,
+static int devlink_port_type_set(struct devlink_port *devlink_port,
enum devlink_port_type port_type)
{
int err;
- if (devlink->ops->port_type_set) {
- if (port_type == devlink_port->type)
- return 0;
- err = devlink->ops->port_type_set(devlink_port, port_type);
- if (err)
- return err;
- devlink_port->desired_type = port_type;
- devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ if (!devlink_port->devlink->ops->port_type_set)
+ return -EOPNOTSUPP;
+
+ if (port_type == devlink_port->type)
return 0;
- }
- return -EOPNOTSUPP;
+
+ err = devlink_port->devlink->ops->port_type_set(devlink_port,
+ port_type);
+ if (err)
+ return err;
+
+ devlink_port->desired_type = port_type;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
}
-static int
-devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port *port,
- const struct nlattr *attr, struct netlink_ext_ack *extack)
+static int devlink_port_function_hw_addr_set(struct devlink_port *port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
{
- const struct devlink_ops *ops;
+ const struct devlink_ops *ops = port->devlink->ops;
const u8 *hw_addr;
int hw_addr_len;
@@ -1327,17 +1343,16 @@ devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port *
}
}
- ops = devlink->ops;
if (!ops->port_function_hw_addr_set) {
NL_SET_ERR_MSG_MOD(extack, "Port doesn't support function attributes");
return -EOPNOTSUPP;
}
- return ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack);
+ return ops->port_function_hw_addr_set(port, hw_addr, hw_addr_len,
+ extack);
}
-static int devlink_port_fn_state_set(struct devlink *devlink,
- struct devlink_port *port,
+static int devlink_port_fn_state_set(struct devlink_port *port,
const struct nlattr *attr,
struct netlink_ext_ack *extack)
{
@@ -1345,18 +1360,18 @@ static int devlink_port_fn_state_set(struct devlink *devlink,
const struct devlink_ops *ops;
state = nla_get_u8(attr);
- ops = devlink->ops;
+ ops = port->devlink->ops;
if (!ops->port_fn_state_set) {
NL_SET_ERR_MSG_MOD(extack,
"Function does not support state setting");
return -EOPNOTSUPP;
}
- return ops->port_fn_state_set(devlink, port, state, extack);
+ return ops->port_fn_state_set(port, state, extack);
}
-static int
-devlink_port_function_set(struct devlink *devlink, struct devlink_port *port,
- const struct nlattr *attr, struct netlink_ext_ack *extack)
+static int devlink_port_function_set(struct devlink_port *port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
{
struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1];
int err;
@@ -1370,7 +1385,7 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port,
attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR];
if (attr) {
- err = devlink_port_function_hw_addr_set(devlink, port, attr, extack);
+ err = devlink_port_function_hw_addr_set(port, attr, extack);
if (err)
return err;
}
@@ -1380,7 +1395,7 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port,
*/
attr = tb[DEVLINK_PORT_FN_ATTR_STATE];
if (attr)
- err = devlink_port_fn_state_set(devlink, port, attr, extack);
+ err = devlink_port_fn_state_set(port, attr, extack);
if (!err)
devlink_port_notify(port, DEVLINK_CMD_PORT_NEW);
@@ -1391,14 +1406,13 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_port *devlink_port = info->user_ptr[1];
- struct devlink *devlink = devlink_port->devlink;
int err;
if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
enum devlink_port_type port_type;
port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
- err = devlink_port_type_set(devlink, devlink_port, port_type);
+ err = devlink_port_type_set(devlink_port, port_type);
if (err)
return err;
}
@@ -1407,7 +1421,7 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION];
struct netlink_ext_ack *extack = info->extack;
- err = devlink_port_function_set(devlink, devlink_port, attr, extack);
+ err = devlink_port_function_set(devlink_port, attr, extack);
if (err)
return err;
}
@@ -1502,9 +1516,8 @@ static int devlink_port_new_notifiy(struct devlink *devlink,
goto out;
}
- err = devlink_nl_port_fill(msg, devlink, devlink_port,
- DEVLINK_CMD_NEW, info->snd_portid,
- info->snd_seq, 0, NULL);
+ err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW,
+ info->snd_portid, info->snd_seq, 0, NULL);
if (err)
goto out;
@@ -1908,13 +1921,18 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
struct devlink *devlink;
struct devlink_sb *devlink_sb;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
continue;
+
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ goto retry;
+
mutex_lock(&devlink->lock);
list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
if (idx < start) {
@@ -1928,11 +1946,14 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
NLM_F_MULTI);
if (err) {
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
goto out;
}
idx++;
}
mutex_unlock(&devlink->lock);
+retry:
+ devlink_put(devlink);
}
out:
mutex_unlock(&devlink_mutex);
@@ -2052,14 +2073,19 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
struct devlink *devlink;
struct devlink_sb *devlink_sb;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err = 0;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
+ continue;
+
if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
!devlink->ops->sb_pool_get)
- continue;
+ goto retry;
+
mutex_lock(&devlink->lock);
list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
err = __sb_pool_get_dumpit(msg, start, &idx, devlink,
@@ -2070,10 +2096,13 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
err = 0;
} else if (err) {
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
goto out;
}
}
mutex_unlock(&devlink->lock);
+retry:
+ devlink_put(devlink);
}
out:
mutex_unlock(&devlink_mutex);
@@ -2265,14 +2294,19 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
struct devlink *devlink;
struct devlink_sb *devlink_sb;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err = 0;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
+ continue;
+
if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
!devlink->ops->sb_port_pool_get)
- continue;
+ goto retry;
+
mutex_lock(&devlink->lock);
list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
err = __sb_port_pool_get_dumpit(msg, start, &idx,
@@ -2283,10 +2317,13 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
err = 0;
} else if (err) {
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
goto out;
}
}
mutex_unlock(&devlink->lock);
+retry:
+ devlink_put(devlink);
}
out:
mutex_unlock(&devlink_mutex);
@@ -2506,14 +2543,18 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
struct devlink *devlink;
struct devlink_sb *devlink_sb;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err = 0;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
+ continue;
+
if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
!devlink->ops->sb_tc_pool_bind_get)
- continue;
+ goto retry;
mutex_lock(&devlink->lock);
list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
@@ -2526,10 +2567,13 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
err = 0;
} else if (err) {
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
goto out;
}
}
mutex_unlock(&devlink->lock);
+retry:
+ devlink_put(devlink);
}
out:
mutex_unlock(&devlink_mutex);
@@ -3801,10 +3845,12 @@ static void devlink_param_notify(struct devlink *devlink,
struct devlink_param_item *param_item,
enum devlink_command cmd);
-static void devlink_reload_netns_change(struct devlink *devlink,
- struct net *dest_net)
+static void devlink_ns_change_notify(struct devlink *devlink,
+ struct net *dest_net, struct net *curr_net,
+ bool new)
{
struct devlink_param_item *param_item;
+ enum devlink_command cmd;
/* Userspace needs to be notified about devlink objects
* removed from original and entering new network namespace.
@@ -3812,17 +3858,18 @@ static void devlink_reload_netns_change(struct devlink *devlink,
* reload process so the notifications are generated separatelly.
*/
- list_for_each_entry(param_item, &devlink->param_list, list)
- devlink_param_notify(devlink, 0, param_item,
- DEVLINK_CMD_PARAM_DEL);
- devlink_notify(devlink, DEVLINK_CMD_DEL);
+ if (!dest_net || net_eq(dest_net, curr_net))
+ return;
- __devlink_net_set(devlink, dest_net);
+ if (new)
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
- devlink_notify(devlink, DEVLINK_CMD_NEW);
+ cmd = new ? DEVLINK_CMD_PARAM_NEW : DEVLINK_CMD_PARAM_DEL;
list_for_each_entry(param_item, &devlink->param_list, list)
- devlink_param_notify(devlink, 0, param_item,
- DEVLINK_CMD_PARAM_NEW);
+ devlink_param_notify(devlink, 0, param_item, cmd);
+
+ if (!new)
+ devlink_notify(devlink, DEVLINK_CMD_DEL);
}
static bool devlink_reload_supported(const struct devlink_ops *ops)
@@ -3902,6 +3949,7 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net,
u32 *actions_performed, struct netlink_ext_ack *extack)
{
u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE];
+ struct net *curr_net;
int err;
if (!devlink->reload_enabled)
@@ -3909,18 +3957,22 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net,
memcpy(remote_reload_stats, devlink->stats.remote_reload_stats,
sizeof(remote_reload_stats));
+
+ curr_net = devlink_net(devlink);
+ devlink_ns_change_notify(devlink, dest_net, curr_net, false);
err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack);
if (err)
return err;
- if (dest_net && !net_eq(dest_net, devlink_net(devlink)))
- devlink_reload_netns_change(devlink, dest_net);
+ if (dest_net && !net_eq(dest_net, curr_net))
+ write_pnet(&devlink->_net, dest_net);
err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack);
devlink_reload_failed_set(devlink, !!err);
if (err)
return err;
+ devlink_ns_change_notify(devlink, dest_net, curr_net, true);
WARN_ON(!(*actions_performed & BIT(action)));
/* Catch driver on updating the remote action within devlink reload */
WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats,
@@ -4117,7 +4169,7 @@ out_free_msg:
static void devlink_flash_update_begin_notify(struct devlink *devlink)
{
- struct devlink_flash_notify params = { 0 };
+ struct devlink_flash_notify params = {};
__devlink_flash_update_notify(devlink,
DEVLINK_CMD_FLASH_UPDATE,
@@ -4126,7 +4178,7 @@ static void devlink_flash_update_begin_notify(struct devlink *devlink)
static void devlink_flash_update_end_notify(struct devlink *devlink)
{
- struct devlink_flash_notify params = { 0 };
+ struct devlink_flash_notify params = {};
__devlink_flash_update_notify(devlink,
DEVLINK_CMD_FLASH_UPDATE_END,
@@ -4283,6 +4335,21 @@ static const struct devlink_param devlink_param_generic[] = {
.name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME,
.type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE,
},
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE,
+ },
};
static int devlink_param_generic_verify(const struct devlink_param *param)
@@ -4553,13 +4620,18 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
struct devlink_param_item *param_item;
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err = 0;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
continue;
+
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ goto retry;
+
mutex_lock(&devlink->lock);
list_for_each_entry(param_item, &devlink->param_list, list) {
if (idx < start) {
@@ -4575,11 +4647,14 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
err = 0;
} else if (err) {
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
goto out;
}
idx++;
}
mutex_unlock(&devlink->lock);
+retry:
+ devlink_put(devlink);
}
out:
mutex_unlock(&devlink_mutex);
@@ -4821,13 +4896,18 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg,
struct devlink_port *devlink_port;
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err = 0;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
continue;
+
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ goto retry;
+
mutex_lock(&devlink->lock);
list_for_each_entry(devlink_port, &devlink->port_list, list) {
list_for_each_entry(param_item,
@@ -4847,12 +4927,15 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg,
err = 0;
} else if (err) {
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
goto out;
}
idx++;
}
}
mutex_unlock(&devlink->lock);
+retry:
+ devlink_put(devlink);
}
out:
mutex_unlock(&devlink_mutex);
@@ -5062,7 +5145,6 @@ static void devlink_nl_region_notify(struct devlink_region *region,
struct devlink_snapshot *snapshot,
enum devlink_command cmd)
{
- struct devlink *devlink = region->devlink;
struct sk_buff *msg;
WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
@@ -5071,8 +5153,9 @@ static void devlink_nl_region_notify(struct devlink_region *region,
if (IS_ERR(msg))
return;
- genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
- msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+ genlmsg_multicast_netns(&devlink_nl_family,
+ devlink_net(region->devlink), msg, 0,
+ DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
/**
@@ -5390,15 +5473,22 @@ static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg,
{
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
- int err;
+ int err = 0;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
continue;
+
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ goto retry;
+
err = devlink_nl_cmd_region_get_devlink_dumpit(msg, cb, devlink,
&idx, start);
+retry:
+ devlink_put(devlink);
if (err)
goto out;
}
@@ -5761,6 +5851,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
nla_nest_end(skb, chunks_attr);
genlmsg_end(skb, hdr);
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
mutex_unlock(&devlink_mutex);
return skb->len;
@@ -5769,6 +5860,7 @@ nla_put_failure:
genlmsg_cancel(skb, hdr);
out_unlock:
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
out_dev:
mutex_unlock(&devlink_mutex);
return err;
@@ -5915,22 +6007,20 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg,
{
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err = 0;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
continue;
- if (idx < start) {
- idx++;
- continue;
- }
- if (!devlink->ops->info_get) {
- idx++;
- continue;
- }
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ goto retry;
+
+ if (idx < start || !devlink->ops->info_get)
+ goto inc;
mutex_lock(&devlink->lock);
err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET,
@@ -5940,9 +6030,14 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg,
mutex_unlock(&devlink->lock);
if (err == -EOPNOTSUPP)
err = 0;
- else if (err)
+ else if (err) {
+ devlink_put(devlink);
break;
+ }
+inc:
idx++;
+retry:
+ devlink_put(devlink);
}
mutex_unlock(&devlink_mutex);
@@ -6756,11 +6851,11 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_destroy);
static int
devlink_nl_health_reporter_fill(struct sk_buff *msg,
- struct devlink *devlink,
struct devlink_health_reporter *reporter,
enum devlink_command cmd, u32 portid,
u32 seq, int flags)
{
+ struct devlink *devlink = reporter->devlink;
struct nlattr *reporter_attr;
void *hdr;
@@ -6837,8 +6932,7 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter,
if (!msg)
return;
- err = devlink_nl_health_reporter_fill(msg, reporter->devlink,
- reporter, cmd, 0, 0, 0);
+ err = devlink_nl_health_reporter_fill(msg, reporter, cmd, 0, 0, 0);
if (err) {
nlmsg_free(msg);
return;
@@ -7028,6 +7122,7 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb)
goto unlock;
reporter = devlink_health_reporter_get_from_attrs(devlink, attrs);
+ devlink_put(devlink);
mutex_unlock(&devlink_mutex);
return reporter;
unlock:
@@ -7071,7 +7166,7 @@ static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb,
goto out;
}
- err = devlink_nl_health_reporter_fill(msg, devlink, reporter,
+ err = devlink_nl_health_reporter_fill(msg, reporter,
DEVLINK_CMD_HEALTH_REPORTER_GET,
info->snd_portid, info->snd_seq,
0);
@@ -7094,13 +7189,18 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
struct devlink_port *port;
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
continue;
+
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ goto retry_rep;
+
mutex_lock(&devlink->reporters_lock);
list_for_each_entry(reporter, &devlink->reporter_list,
list) {
@@ -7108,24 +7208,29 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
idx++;
continue;
}
- err = devlink_nl_health_reporter_fill(msg, devlink,
- reporter,
- DEVLINK_CMD_HEALTH_REPORTER_GET,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI);
+ err = devlink_nl_health_reporter_fill(
+ msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET,
+ NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
if (err) {
mutex_unlock(&devlink->reporters_lock);
+ devlink_put(devlink);
goto out;
}
idx++;
}
mutex_unlock(&devlink->reporters_lock);
+retry_rep:
+ devlink_put(devlink);
}
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
continue;
+
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ goto retry_port;
+
mutex_lock(&devlink->lock);
list_for_each_entry(port, &devlink->port_list, list) {
mutex_lock(&port->reporters_lock);
@@ -7134,14 +7239,15 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
idx++;
continue;
}
- err = devlink_nl_health_reporter_fill(msg, devlink, reporter,
- DEVLINK_CMD_HEALTH_REPORTER_GET,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI);
+ err = devlink_nl_health_reporter_fill(
+ msg, reporter,
+ DEVLINK_CMD_HEALTH_REPORTER_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
if (err) {
mutex_unlock(&port->reporters_lock);
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
goto out;
}
idx++;
@@ -7149,6 +7255,8 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg,
mutex_unlock(&port->reporters_lock);
}
mutex_unlock(&devlink->lock);
+retry_port:
+ devlink_put(devlink);
}
out:
mutex_unlock(&devlink_mutex);
@@ -7677,13 +7785,18 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg,
struct devlink_trap_item *trap_item;
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
continue;
+
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ goto retry;
+
mutex_lock(&devlink->lock);
list_for_each_entry(trap_item, &devlink->trap_list, list) {
if (idx < start) {
@@ -7697,11 +7810,14 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg,
NLM_F_MULTI);
if (err) {
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
goto out;
}
idx++;
}
mutex_unlock(&devlink->lock);
+retry:
+ devlink_put(devlink);
}
out:
mutex_unlock(&devlink_mutex);
@@ -7896,13 +8012,18 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg,
u32 portid = NETLINK_CB(cb->skb).portid;
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
continue;
+
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ goto retry;
+
mutex_lock(&devlink->lock);
list_for_each_entry(group_item, &devlink->trap_group_list,
list) {
@@ -7917,11 +8038,14 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg,
NLM_F_MULTI);
if (err) {
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
goto out;
}
idx++;
}
mutex_unlock(&devlink->lock);
+retry:
+ devlink_put(devlink);
}
out:
mutex_unlock(&devlink_mutex);
@@ -8202,13 +8326,18 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg,
u32 portid = NETLINK_CB(cb->skb).portid;
struct devlink *devlink;
int start = cb->args[0];
+ unsigned long index;
int idx = 0;
int err;
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
continue;
+
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ goto retry;
+
mutex_lock(&devlink->lock);
list_for_each_entry(policer_item, &devlink->trap_policer_list,
list) {
@@ -8223,11 +8352,14 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg,
NLM_F_MULTI);
if (err) {
mutex_unlock(&devlink->lock);
+ devlink_put(devlink);
goto out;
}
idx++;
}
mutex_unlock(&devlink->lock);
+retry:
+ devlink_put(devlink);
}
out:
mutex_unlock(&devlink_mutex);
@@ -8768,30 +8900,44 @@ static bool devlink_reload_actions_valid(const struct devlink_ops *ops)
}
/**
- * devlink_alloc - Allocate new devlink instance resources
+ * devlink_alloc_ns - Allocate new devlink instance resources
+ * in specific namespace
*
* @ops: ops
* @priv_size: size of user private data
+ * @net: net namespace
+ * @dev: parent device
*
* Allocate new devlink instance resources, including devlink index
* and name.
*/
-struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
+struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
+ size_t priv_size, struct net *net,
+ struct device *dev)
{
struct devlink *devlink;
+ static u32 last_id;
+ int ret;
- if (WARN_ON(!ops))
- return NULL;
-
+ WARN_ON(!ops || !dev);
if (!devlink_reload_actions_valid(ops))
return NULL;
devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);
if (!devlink)
return NULL;
+
+ ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b,
+ &last_id, GFP_KERNEL);
+ if (ret < 0) {
+ kfree(devlink);
+ return NULL;
+ }
+
+ devlink->dev = dev;
devlink->ops = ops;
xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
- __devlink_net_set(devlink, &init_net);
+ write_pnet(&devlink->_net, net);
INIT_LIST_HEAD(&devlink->port_list);
INIT_LIST_HEAD(&devlink->rate_list);
INIT_LIST_HEAD(&devlink->sb_list);
@@ -8805,22 +8951,22 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
INIT_LIST_HEAD(&devlink->trap_policer_list);
mutex_init(&devlink->lock);
mutex_init(&devlink->reporters_lock);
+ refcount_set(&devlink->refcount, 1);
+ init_completion(&devlink->comp);
+
return devlink;
}
-EXPORT_SYMBOL_GPL(devlink_alloc);
+EXPORT_SYMBOL_GPL(devlink_alloc_ns);
/**
* devlink_register - Register devlink instance
*
* @devlink: devlink
- * @dev: parent device
*/
-int devlink_register(struct devlink *devlink, struct device *dev)
+int devlink_register(struct devlink *devlink)
{
- devlink->dev = dev;
- devlink->registered = true;
mutex_lock(&devlink_mutex);
- list_add_tail(&devlink->list, &devlink_list);
+ xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
devlink_notify(devlink, DEVLINK_CMD_NEW);
mutex_unlock(&devlink_mutex);
return 0;
@@ -8834,11 +8980,14 @@ EXPORT_SYMBOL_GPL(devlink_register);
*/
void devlink_unregister(struct devlink *devlink)
{
+ devlink_put(devlink);
+ wait_for_completion(&devlink->comp);
+
mutex_lock(&devlink_mutex);
WARN_ON(devlink_reload_supported(devlink->ops) &&
devlink->reload_enabled);
devlink_notify(devlink, DEVLINK_CMD_DEL);
- list_del(&devlink->list);
+ xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
mutex_unlock(&devlink_mutex);
}
EXPORT_SYMBOL_GPL(devlink_unregister);
@@ -8900,6 +9049,7 @@ void devlink_free(struct devlink *devlink)
WARN_ON(!list_empty(&devlink->port_list));
xa_destroy(&devlink->snapshot_ids);
+ xa_erase(&devlinks, devlink->index);
kfree(devlink);
}
@@ -8960,9 +9110,10 @@ int devlink_port_register(struct devlink *devlink,
mutex_unlock(&devlink->lock);
return -EEXIST;
}
+
+ WARN_ON(devlink_port->devlink);
devlink_port->devlink = devlink;
devlink_port->index = port_index;
- devlink_port->registered = true;
spin_lock_init(&devlink_port->type_lock);
INIT_LIST_HEAD(&devlink_port->reporter_list);
mutex_init(&devlink_port->reporters_lock);
@@ -9001,7 +9152,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
enum devlink_port_type type,
void *type_dev)
{
- if (WARN_ON(!devlink_port->registered))
+ if (WARN_ON(!devlink_port->devlink))
return;
devlink_port_type_warn_cancel(devlink_port);
spin_lock_bh(&devlink_port->type_lock);
@@ -9121,7 +9272,7 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
{
int ret;
- if (WARN_ON(devlink_port->registered))
+ if (WARN_ON(devlink_port->devlink))
return;
devlink_port->attrs = *attrs;
ret = __devlink_port_attrs_set(devlink_port, attrs->flavour);
@@ -9145,7 +9296,7 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
- if (WARN_ON(devlink_port->registered))
+ if (WARN_ON(devlink_port->devlink))
return;
ret = __devlink_port_attrs_set(devlink_port,
DEVLINK_PORT_FLAVOUR_PCI_PF);
@@ -9172,7 +9323,7 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
- if (WARN_ON(devlink_port->registered))
+ if (WARN_ON(devlink_port->devlink))
return;
ret = __devlink_port_attrs_set(devlink_port,
DEVLINK_PORT_FLAVOUR_PCI_VF);
@@ -9200,7 +9351,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
- if (WARN_ON(devlink_port->registered))
+ if (WARN_ON(devlink_port->devlink))
return;
ret = __devlink_port_attrs_set(devlink_port,
DEVLINK_PORT_FLAVOUR_PCI_SF);
@@ -9328,18 +9479,10 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
switch (attrs->flavour) {
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
- case DEVLINK_PORT_FLAVOUR_VIRTUAL:
n = snprintf(name, len, "p%u", attrs->phys.port_number);
if (n < len && attrs->split)
n += snprintf(name + n, len - n, "s%u",
attrs->phys.split_subport_number);
- if (!attrs->split)
- n = snprintf(name, len, "p%u", attrs->phys.port_number);
- else
- n = snprintf(name, len, "p%us%u",
- attrs->phys.port_number,
- attrs->phys.split_subport_number);
-
break;
case DEVLINK_PORT_FLAVOUR_CPU:
case DEVLINK_PORT_FLAVOUR_DSA:
@@ -9381,6 +9524,8 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf,
attrs->pci_sf.sf);
break;
+ case DEVLINK_PORT_FLAVOUR_VIRTUAL:
+ return -EOPNOTSUPP;
}
if (n >= len)
@@ -9794,6 +9939,22 @@ static int devlink_param_verify(const struct devlink_param *param)
return devlink_param_driver_verify(param);
}
+static int __devlink_param_register_one(struct devlink *devlink,
+ unsigned int port_index,
+ struct list_head *param_list,
+ const struct devlink_param *param,
+ enum devlink_command reg_cmd)
+{
+ int err;
+
+ err = devlink_param_verify(param);
+ if (err)
+ return err;
+
+ return devlink_param_register_one(devlink, port_index,
+ param_list, param, reg_cmd);
+}
+
static int __devlink_params_register(struct devlink *devlink,
unsigned int port_index,
struct list_head *param_list,
@@ -9808,12 +9969,8 @@ static int __devlink_params_register(struct devlink *devlink,
mutex_lock(&devlink->lock);
for (i = 0; i < params_count; i++, param++) {
- err = devlink_param_verify(param);
- if (err)
- goto rollback;
-
- err = devlink_param_register_one(devlink, port_index,
- param_list, param, reg_cmd);
+ err = __devlink_param_register_one(devlink, port_index,
+ param_list, param, reg_cmd);
if (err)
goto rollback;
}
@@ -9886,6 +10043,43 @@ void devlink_params_unregister(struct devlink *devlink,
EXPORT_SYMBOL_GPL(devlink_params_unregister);
/**
+ * devlink_param_register - register one configuration parameter
+ *
+ * @devlink: devlink
+ * @param: one configuration parameter
+ *
+ * Register the configuration parameter supported by the driver.
+ * Return: returns 0 on successful registration or error code otherwise.
+ */
+int devlink_param_register(struct devlink *devlink,
+ const struct devlink_param *param)
+{
+ int err;
+
+ mutex_lock(&devlink->lock);
+ err = __devlink_param_register_one(devlink, 0, &devlink->param_list,
+ param, DEVLINK_CMD_PARAM_NEW);
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_param_register);
+
+/**
+ * devlink_param_unregister - unregister one configuration parameter
+ * @devlink: devlink
+ * @param: configuration parameter to unregister
+ */
+void devlink_param_unregister(struct devlink *devlink,
+ const struct devlink_param *param)
+{
+ mutex_lock(&devlink->lock);
+ devlink_param_unregister_one(devlink, 0, &devlink->param_list, param,
+ DEVLINK_CMD_PARAM_DEL);
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_param_unregister);
+
+/**
* devlink_params_publish - publish configuration parameters
*
* @devlink: devlink
@@ -9928,6 +10122,54 @@ void devlink_params_unpublish(struct devlink *devlink)
EXPORT_SYMBOL_GPL(devlink_params_unpublish);
/**
+ * devlink_param_publish - publish one configuration parameter
+ *
+ * @devlink: devlink
+ * @param: one configuration parameter
+ *
+ * Publish previously registered configuration parameter.
+ */
+void devlink_param_publish(struct devlink *devlink,
+ const struct devlink_param *param)
+{
+ struct devlink_param_item *param_item;
+
+ list_for_each_entry(param_item, &devlink->param_list, list) {
+ if (param_item->param != param || param_item->published)
+ continue;
+ param_item->published = true;
+ devlink_param_notify(devlink, 0, param_item,
+ DEVLINK_CMD_PARAM_NEW);
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(devlink_param_publish);
+
+/**
+ * devlink_param_unpublish - unpublish one configuration parameter
+ *
+ * @devlink: devlink
+ * @param: one configuration parameter
+ *
+ * Unpublish previously registered configuration parameter.
+ */
+void devlink_param_unpublish(struct devlink *devlink,
+ const struct devlink_param *param)
+{
+ struct devlink_param_item *param_item;
+
+ list_for_each_entry(param_item, &devlink->param_list, list) {
+ if (param_item->param != param || !param_item->published)
+ continue;
+ param_item->published = false;
+ devlink_param_notify(devlink, 0, param_item,
+ DEVLINK_CMD_PARAM_DEL);
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(devlink_param_unpublish);
+
+/**
* devlink_port_params_register - register port configuration parameters
*
* @devlink_port: devlink port
@@ -11282,23 +11524,29 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net)
{
struct devlink *devlink;
u32 actions_performed;
+ unsigned long index;
int err;
/* In case network namespace is getting destroyed, reload
* all devlink instances from this namespace into init_net.
*/
mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (net_eq(devlink_net(devlink), net)) {
- if (WARN_ON(!devlink_reload_supported(devlink->ops)))
- continue;
- err = devlink_reload(devlink, &init_net,
- DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
- DEVLINK_RELOAD_LIMIT_UNSPEC,
- &actions_performed, NULL);
- if (err && err != -EOPNOTSUPP)
- pr_warn("Failed to reload devlink instance into init_net\n");
- }
+ xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) {
+ if (!devlink_try_get(devlink))
+ continue;
+
+ if (!net_eq(devlink_net(devlink), net))
+ goto retry;
+
+ WARN_ON(!devlink_reload_supported(devlink->ops));
+ err = devlink_reload(devlink, &init_net,
+ DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
+ DEVLINK_RELOAD_LIMIT_UNSPEC,
+ &actions_performed, NULL);
+ if (err && err != -EOPNOTSUPP)
+ pr_warn("Failed to reload devlink instance into init_net\n");
+retry:
+ devlink_put(devlink);
}
mutex_unlock(&devlink_mutex);
}
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index ead2a8aa57b4..49442cae6f69 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -850,8 +850,7 @@ net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata)
}
hw_metadata->input_dev = metadata->input_dev;
- if (hw_metadata->input_dev)
- dev_hold(hw_metadata->input_dev);
+ dev_hold(hw_metadata->input_dev);
return hw_metadata;
@@ -867,8 +866,7 @@ free_hw_metadata:
static void
net_dm_hw_metadata_free(const struct devlink_trap_metadata *hw_metadata)
{
- if (hw_metadata->input_dev)
- dev_put(hw_metadata->input_dev);
+ dev_put(hw_metadata->input_dev);
kfree(hw_metadata->fa_cookie);
kfree(hw_metadata->trap_name);
kfree(hw_metadata->trap_group_name);
diff --git a/net/core/dst.c b/net/core/dst.c
index fb3bcba87744..497ef9b3fc6a 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -49,8 +49,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
unsigned short flags)
{
dst->dev = dev;
- if (dev)
- dev_hold(dev);
+ dev_hold(dev);
dst->ops = ops;
dst_init_metrics(dst, dst_default_metrics.metrics, true);
dst->expires = 0UL;
@@ -118,8 +117,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
if (dst->ops->destroy)
dst->ops->destroy(dst);
- if (dst->dev)
- dev_put(dst->dev);
+ dev_put(dst->dev);
lwtstate_put(dst->lwtstate);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index a9f937975080..79df7cd9dbc1 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -57,7 +57,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
{
struct fib_rule *r;
- r = kzalloc(ops->rule_size, GFP_KERNEL);
+ r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);
if (r == NULL)
return -ENOMEM;
@@ -541,7 +541,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- nlrule = kzalloc(ops->rule_size, GFP_KERNEL);
+ nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);
if (!nlrule) {
err = -ENOMEM;
goto errout;
diff --git a/net/core/filter.c b/net/core/filter.c
index d70187ce851b..2e32cee2c469 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -77,6 +77,7 @@
#include <net/transp_v6.h>
#include <linux/btf_ids.h>
#include <net/tls.h>
+#include <net/xdp.h>
static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id);
@@ -113,7 +114,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
* Run the eBPF program and then cut skb->data to correct size returned by
* the program. If pkt_len is 0 we toss packet. If skb->len is smaller
* than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
+ * wrapper to bpf_prog_run. It returns 0 if the packet should
* be accepted or -EPERM if the packet should be tossed.
*
*/
@@ -2179,17 +2180,9 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
skb->tstamp = 0;
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
- struct sk_buff *skb2;
-
- skb2 = skb_realloc_headroom(skb, hh_len);
- if (unlikely(!skb2)) {
- kfree_skb(skb);
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
return -ENOMEM;
- }
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk);
- consume_skb(skb);
- skb = skb2;
}
rcu_read_lock_bh();
@@ -2213,8 +2206,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
}
rcu_read_unlock_bh();
if (dst)
- IP6_INC_STATS(dev_net(dst->dev),
- ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+ IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
out_drop:
kfree_skb(skb);
return -ENETDOWN;
@@ -2286,17 +2278,9 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
skb->tstamp = 0;
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
- struct sk_buff *skb2;
-
- skb2 = skb_realloc_headroom(skb, hh_len);
- if (unlikely(!skb2)) {
- kfree_skb(skb);
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
return -ENOMEM;
- }
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk);
- consume_skb(skb);
- skb = skb2;
}
rcu_read_lock_bh();
@@ -3880,8 +3864,7 @@ BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
if (unlikely(meta < xdp_frame_end ||
meta > xdp->data))
return -EINVAL;
- if (unlikely((metalen & (sizeof(__u32) - 1)) ||
- (metalen > 32)))
+ if (unlikely(xdp_metalen_invalid(metalen)))
return -EACCES;
xdp->data_meta = meta;
@@ -3950,6 +3933,31 @@ void bpf_clear_redirect_map(struct bpf_map *map)
}
}
+DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
+EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);
+
+u32 xdp_master_redirect(struct xdp_buff *xdp)
+{
+ struct net_device *master, *slave;
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+ master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
+ slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
+ if (slave && slave != xdp->rxq->dev) {
+ /* The target device is different from the receiving device, so
+ * redirect it to the new device.
+ * Using XDP_REDIRECT gets the correct behaviour from XDP enabled
+ * drivers to unmap the packet from their rx ring.
+ */
+ ri->tgt_index = slave->ifindex;
+ ri->map_id = INT_MAX;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
+ return XDP_REDIRECT;
+ }
+ return XDP_TX;
+}
+EXPORT_SYMBOL_GPL(xdp_master_redirect);
+
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
@@ -4040,8 +4048,12 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
goto err;
consume_skb(skb);
break;
+ case BPF_MAP_TYPE_CPUMAP:
+ err = cpu_map_generic_redirect(fwd, skb);
+ if (unlikely(err))
+ goto err;
+ break;
default:
- /* TODO: Handle BPF_MAP_TYPE_CPUMAP */
err = -EBADRQC;
goto err;
}
@@ -4664,6 +4676,30 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
.arg1_type = ARG_PTR_TO_CTX_OR_NULL,
};
+BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = {
+ .func = bpf_get_netns_cookie_sock_ops,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = {
+ .func = bpf_get_netns_cookie_sk_msg,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
{
struct sock *sk = sk_to_full_sk(skb->sk);
@@ -5012,6 +5048,46 @@ err_clear:
return -EINVAL;
}
+BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ if (level == SOL_TCP && optname == TCP_CONGESTION) {
+ if (optlen >= sizeof("cdg") - 1 &&
+ !strncmp("cdg", optval, optlen))
+ return -ENOTSUPP;
+ }
+
+ return _bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_setsockopt_proto = {
+ .func = bpf_sk_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ return _bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_getsockopt_proto = {
+ .func = bpf_sk_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
int, level, int, optname, char *, optval, int, optlen)
{
@@ -7445,6 +7521,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_ops_proto;
#ifdef CONFIG_INET
case BPF_FUNC_load_hdr_opt:
return &bpf_sock_ops_load_hdr_opt_proto;
@@ -7491,6 +7569,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sk_msg_proto;
#ifdef CONFIG_CGROUPS
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
@@ -10069,7 +10149,7 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
enum sk_action action;
bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
- action = BPF_PROG_RUN(prog, &reuse_kern);
+ action = bpf_prog_run(prog, &reuse_kern);
if (action == SK_PASS)
return reuse_kern.selected_sk;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 2aadbfc5193b..bac0184cf3de 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1056,8 +1056,10 @@ proto_again:
FLOW_DISSECTOR_KEY_IPV4_ADDRS,
target_container);
- memcpy(&key_addrs->v4addrs, &iph->saddr,
- sizeof(key_addrs->v4addrs));
+ memcpy(&key_addrs->v4addrs.src, &iph->saddr,
+ sizeof(key_addrs->v4addrs.src));
+ memcpy(&key_addrs->v4addrs.dst, &iph->daddr,
+ sizeof(key_addrs->v4addrs.dst));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
}
@@ -1101,8 +1103,10 @@ proto_again:
FLOW_DISSECTOR_KEY_IPV6_ADDRS,
target_container);
- memcpy(&key_addrs->v6addrs, &iph->saddr,
- sizeof(key_addrs->v6addrs));
+ memcpy(&key_addrs->v6addrs.src, &iph->saddr,
+ sizeof(key_addrs->v6addrs.src));
+ memcpy(&key_addrs->v6addrs.dst, &iph->daddr,
+ sizeof(key_addrs->v6addrs.dst));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
@@ -1504,7 +1508,7 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow)
}
EXPORT_SYMBOL(flow_get_u32_dst);
-/* Sort the source and destination IP (and the ports if the IP are the same),
+/* Sort the source and destination IP and the ports,
* to have consistent hash within the two directions
*/
static inline void __flow_hash_consistentify(struct flow_keys *keys)
@@ -1515,11 +1519,11 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys)
case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
addr_diff = (__force u32)keys->addrs.v4addrs.dst -
(__force u32)keys->addrs.v4addrs.src;
- if ((addr_diff < 0) ||
- (addr_diff == 0 &&
- ((__force u16)keys->ports.dst <
- (__force u16)keys->ports.src))) {
+ if (addr_diff < 0)
swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst);
+
+ if ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src) {
swap(keys->ports.src, keys->ports.dst);
}
break;
@@ -1527,13 +1531,13 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys)
addr_diff = memcmp(&keys->addrs.v6addrs.dst,
&keys->addrs.v6addrs.src,
sizeof(keys->addrs.v6addrs.dst));
- if ((addr_diff < 0) ||
- (addr_diff == 0 &&
- ((__force u16)keys->ports.dst <
- (__force u16)keys->ports.src))) {
+ if (addr_diff < 0) {
for (i = 0; i < 4; i++)
swap(keys->addrs.v6addrs.src.s6_addr32[i],
keys->addrs.v6addrs.dst.s6_addr32[i]);
+ }
+ if ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src) {
swap(keys->ports.src, keys->ports.dst);
}
break;
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 715b67f6c62f..6beaea13564a 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -321,13 +321,13 @@ EXPORT_SYMBOL(flow_block_cb_setup_simple);
static DEFINE_MUTEX(flow_indr_block_lock);
static LIST_HEAD(flow_block_indr_list);
static LIST_HEAD(flow_block_indr_dev_list);
+static LIST_HEAD(flow_indir_dev_list);
struct flow_indr_dev {
struct list_head list;
flow_indr_block_bind_cb_t *cb;
void *cb_priv;
refcount_t refcnt;
- struct rcu_head rcu;
};
static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb,
@@ -346,6 +346,33 @@ static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb,
return indr_dev;
}
+struct flow_indir_dev_info {
+ void *data;
+ struct net_device *dev;
+ struct Qdisc *sch;
+ enum tc_setup_type type;
+ void (*cleanup)(struct flow_block_cb *block_cb);
+ struct list_head list;
+ enum flow_block_command command;
+ enum flow_block_binder_type binder_type;
+ struct list_head *cb_list;
+};
+
+static void existing_qdiscs_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
+{
+ struct flow_block_offload bo;
+ struct flow_indir_dev_info *cur;
+
+ list_for_each_entry(cur, &flow_indir_dev_list, list) {
+ memset(&bo, 0, sizeof(bo));
+ bo.command = cur->command;
+ bo.binder_type = cur->binder_type;
+ INIT_LIST_HEAD(&bo.cb_list);
+ cb(cur->dev, cur->sch, cb_priv, cur->type, &bo, cur->data, cur->cleanup);
+ list_splice(&bo.cb_list, cur->cb_list);
+ }
+}
+
int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
{
struct flow_indr_dev *indr_dev;
@@ -367,6 +394,7 @@ int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
}
list_add(&indr_dev->list, &flow_block_indr_dev_list);
+ existing_qdiscs_register(cb, cb_priv);
mutex_unlock(&flow_indr_block_lock);
return 0;
@@ -463,7 +491,59 @@ out:
}
EXPORT_SYMBOL(flow_indr_block_cb_alloc);
-int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch,
+static struct flow_indir_dev_info *find_indir_dev(void *data)
+{
+ struct flow_indir_dev_info *cur;
+
+ list_for_each_entry(cur, &flow_indir_dev_list, list) {
+ if (cur->data == data)
+ return cur;
+ }
+ return NULL;
+}
+
+static int indir_dev_add(void *data, struct net_device *dev, struct Qdisc *sch,
+ enum tc_setup_type type, void (*cleanup)(struct flow_block_cb *block_cb),
+ struct flow_block_offload *bo)
+{
+ struct flow_indir_dev_info *info;
+
+ info = find_indir_dev(data);
+ if (info)
+ return -EEXIST;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ info->data = data;
+ info->dev = dev;
+ info->sch = sch;
+ info->type = type;
+ info->cleanup = cleanup;
+ info->command = bo->command;
+ info->binder_type = bo->binder_type;
+ info->cb_list = bo->cb_list_head;
+
+ list_add(&info->list, &flow_indir_dev_list);
+ return 0;
+}
+
+static int indir_dev_remove(void *data)
+{
+ struct flow_indir_dev_info *info;
+
+ info = find_indir_dev(data);
+ if (!info)
+ return -ENOENT;
+
+ list_del(&info->list);
+
+ kfree(info);
+ return 0;
+}
+
+int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch,
enum tc_setup_type type, void *data,
struct flow_block_offload *bo,
void (*cleanup)(struct flow_block_cb *block_cb))
@@ -471,6 +551,12 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch,
struct flow_indr_dev *this;
mutex_lock(&flow_indr_block_lock);
+
+ if (bo->command == FLOW_BLOCK_BIND)
+ indir_dev_add(data, dev, sch, type, cleanup, bo);
+ else if (bo->command == FLOW_BLOCK_UNBIND)
+ indir_dev_remove(data);
+
list_for_each_entry(this, &flow_block_indr_dev_list, list)
this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 75431ca9300f..1a455847da54 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -158,7 +158,7 @@ static void linkwatch_do_dev(struct net_device *dev)
clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
rfc2863_policy(dev);
- if (dev->flags & IFF_UP && netif_device_present(dev)) {
+ if (dev->flags & IFF_UP) {
if (netif_carrier_ok(dev))
dev_activate(dev);
else
@@ -204,7 +204,8 @@ static void __linkwatch_run_queue(int urgent_only)
dev = list_first_entry(&wrk, struct net_device, link_watch_list);
list_del_init(&dev->link_watch_list);
- if (urgent_only && !linkwatch_urgent_event(dev)) {
+ if (!netif_device_present(dev) ||
+ (urgent_only && !linkwatch_urgent_event(dev))) {
list_add_tail(&dev->link_watch_list, &lweventlist);
continue;
}
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 8ec7d13d2860..2820aca2173a 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -23,6 +23,9 @@
#include <net/ip6_fib.h>
#include <net/rtnh.h>
+DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);
+EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled);
+
#ifdef CONFIG_MODULES
static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
@@ -43,6 +46,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "SEG6LOCAL";
case LWTUNNEL_ENCAP_RPL:
return "RPL";
+ case LWTUNNEL_ENCAP_IOAM6:
+ return "IOAM6";
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE:
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 53e85c70c6e5..2d5bc3a75fae 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -741,12 +741,10 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
write_pnet(&n->net, net);
memcpy(n->key, pkey, key_len);
n->dev = dev;
- if (dev)
- dev_hold(dev);
+ dev_hold(dev);
if (tbl->pconstructor && tbl->pconstructor(n)) {
- if (dev)
- dev_put(dev);
+ dev_put(dev);
kfree(n);
n = NULL;
goto out;
@@ -778,8 +776,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
write_unlock_bh(&tbl->lock);
if (tbl->pdestructor)
tbl->pdestructor(n);
- if (n->dev)
- dev_put(n->dev);
+ dev_put(n->dev);
kfree(n);
return 0;
}
@@ -812,8 +809,7 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
n->next = NULL;
if (tbl->pdestructor)
tbl->pdestructor(n);
- if (n->dev)
- dev_put(n->dev);
+ dev_put(n->dev);
kfree(n);
}
return -ENOENT;
@@ -1662,8 +1658,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
list_del(&parms->list);
parms->dead = 1;
write_unlock_bh(&tbl->lock);
- if (parms->dev)
- dev_put(parms->dev);
+ dev_put(parms->dev);
call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
}
EXPORT_SYMBOL(neigh_parms_release);
@@ -2533,6 +2528,13 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx)
return false;
master = dev ? netdev_master_upper_dev_get(dev) : NULL;
+
+ /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another
+ * invalid value for ifindex to denote "no master".
+ */
+ if (master_idx == -1)
+ return !!master;
+
if (!master || master->ifindex != master_idx)
return true;
@@ -3315,12 +3317,13 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v)
struct neigh_statistics *st = v;
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n");
+ seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n");
return 0;
}
- seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
- "%08lx %08lx %08lx %08lx %08lx %08lx\n",
+ seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
+ "%08lx %08lx %08lx "
+ "%08lx %08lx %08lx\n",
atomic_read(&tbl->entries),
st->allocs,
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index d8b9dbabd4a4..eab5fc88a002 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -77,8 +77,8 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
struct rtnl_link_stats64 temp;
const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
- seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
- "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
+ seq_printf(seq, "%9s: %16llu %12llu %4llu %6llu %4llu %5llu %10llu %9llu "
+ "%16llu %12llu %4llu %6llu %4llu %5llu %7llu %10llu\n",
dev->name, stats->rx_bytes, stats->rx_packets,
stats->rx_errors,
stats->rx_dropped + stats->rx_missed_errors,
@@ -103,11 +103,11 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
static int dev_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN)
- seq_puts(seq, "Inter-| Receive "
- " | Transmit\n"
- " face |bytes packets errs drop fifo frame "
- "compressed multicast|bytes packets errs "
- "drop fifo colls carrier compressed\n");
+ seq_puts(seq, "Interface| Receive "
+ " | Transmit\n"
+ " | bytes packets errs drop fifo frame "
+ "compressed multicast| bytes packets errs "
+ " drop fifo colls carrier compressed\n");
else
dev_seq_printf_stats(seq, v);
return 0;
@@ -259,14 +259,14 @@ static int ptype_seq_show(struct seq_file *seq, void *v)
struct packet_type *pt = v;
if (v == SEQ_START_TOKEN)
- seq_puts(seq, "Type Device Function\n");
+ seq_puts(seq, "Type Device Function\n");
else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
if (pt->type == htons(ETH_P_ALL))
seq_puts(seq, "ALL ");
else
seq_printf(seq, "%04x", ntohs(pt->type));
- seq_printf(seq, " %-8s %ps\n",
+ seq_printf(seq, " %-9s %ps\n",
pt->dev ? pt->dev->name : "", pt->func);
}
@@ -327,12 +327,14 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v)
struct netdev_hw_addr *ha;
struct net_device *dev = v;
- if (v == SEQ_START_TOKEN)
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(seq, "Ifindex Interface Refcount Global_use Address\n");
return 0;
+ }
netif_addr_lock_bh(dev);
netdev_for_each_mc_addr(ha, dev) {
- seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n",
+ seq_printf(seq, "%-7d %-9s %-8d %-10d %*phN\n",
dev->ifindex, dev->name,
ha->refcount, ha->global_use,
(int)dev->addr_len, ha->addr);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 9b5a767eddd5..a448a9b5bb2d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -98,7 +98,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
}
ng = net_alloc_generic();
- if (ng == NULL)
+ if (!ng)
return -ENOMEM;
/*
@@ -148,13 +148,6 @@ out:
return err;
}
-static void ops_free(const struct pernet_operations *ops, struct net *net)
-{
- if (ops->id && ops->size) {
- kfree(net_generic(net, *ops->id));
- }
-}
-
static void ops_pre_exit_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
@@ -184,7 +177,7 @@ static void ops_free_list(const struct pernet_operations *ops,
struct net *net;
if (ops->size && ops->id) {
list_for_each_entry(net, net_exit_list, exit_list)
- ops_free(ops, net);
+ kfree(net_generic(net, *ops->id));
}
}
@@ -433,15 +426,18 @@ out_free:
static void net_free(struct net *net)
{
- kfree(rcu_access_pointer(net->gen));
- kmem_cache_free(net_cachep, net);
+ if (refcount_dec_and_test(&net->passive)) {
+ kfree(rcu_access_pointer(net->gen));
+ kmem_cache_free(net_cachep, net);
+ }
}
void net_drop_ns(void *p)
{
- struct net *ns = p;
- if (ns && refcount_dec_and_test(&ns->passive))
- net_free(ns);
+ struct net *net = (struct net *)p;
+
+ if (net)
+ net_free(net);
}
struct net *copy_net_ns(unsigned long flags,
@@ -479,7 +475,7 @@ struct net *copy_net_ns(unsigned long flags,
put_userns:
key_remove_domain(net->key_domain);
put_user_ns(user_ns);
- net_drop_ns(net);
+ net_free(net);
dec_ucounts:
dec_net_namespaces(ucounts);
return ERR_PTR(rv);
@@ -611,7 +607,7 @@ static void cleanup_net(struct work_struct *work)
dec_net_namespaces(net->ucounts);
key_remove_domain(net->key_domain);
put_user_ns(net->user_ns);
- net_drop_ns(net);
+ net_free(net);
}
}
@@ -1120,6 +1116,14 @@ static int __init net_ns_init(void)
pure_initcall(net_ns_init);
+static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list)
+{
+ ops_pre_exit_list(ops, net_exit_list);
+ synchronize_rcu();
+ ops_exit_list(ops, net_exit_list);
+ ops_free_list(ops, net_exit_list);
+}
+
#ifdef CONFIG_NET_NS
static int __register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
@@ -1145,10 +1149,7 @@ static int __register_pernet_operations(struct list_head *list,
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
- ops_pre_exit_list(ops, &net_exit_list);
- synchronize_rcu();
- ops_exit_list(ops, &net_exit_list);
- ops_free_list(ops, &net_exit_list);
+ free_exit_list(ops, &net_exit_list);
return error;
}
@@ -1161,10 +1162,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
/* See comment in __register_pernet_operations() */
for_each_net(net)
list_add_tail(&net->exit_list, &net_exit_list);
- ops_pre_exit_list(ops, &net_exit_list);
- synchronize_rcu();
- ops_exit_list(ops, &net_exit_list);
- ops_free_list(ops, &net_exit_list);
+
+ free_exit_list(ops, &net_exit_list);
}
#else
@@ -1187,10 +1186,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
} else {
LIST_HEAD(net_exit_list);
list_add(&init_net.exit_list, &net_exit_list);
- ops_pre_exit_list(ops, &net_exit_list);
- synchronize_rcu();
- ops_exit_list(ops, &net_exit_list);
- ops_free_list(ops, &net_exit_list);
+ free_exit_list(ops, &net_exit_list);
}
}
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5e4eb45b139c..1a6978427d6c 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -24,6 +24,8 @@
#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)
+#define BIAS_MAX LONG_MAX
+
static int page_pool_init(struct page_pool *pool,
const struct page_pool_params *params)
{
@@ -67,6 +69,10 @@ static int page_pool_init(struct page_pool *pool,
*/
}
+ if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
+ pool->p.flags & PP_FLAG_PAGE_FRAG)
+ return -EINVAL;
+
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
return -ENOMEM;
@@ -206,6 +212,19 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
return true;
}
+static void page_pool_set_pp_info(struct page_pool *pool,
+ struct page *page)
+{
+ page->pp = pool;
+ page->pp_magic |= PP_SIGNATURE;
+}
+
+static void page_pool_clear_pp_info(struct page *page)
+{
+ page->pp_magic = 0;
+ page->pp = NULL;
+}
+
static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
gfp_t gfp)
{
@@ -222,7 +241,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
return NULL;
}
- page->pp_magic |= PP_SIGNATURE;
+ page_pool_set_pp_info(pool, page);
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
@@ -266,7 +285,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
put_page(page);
continue;
}
- page->pp_magic |= PP_SIGNATURE;
+
+ page_pool_set_pp_info(pool, page);
pool->alloc.cache[pool->alloc.count++] = page;
/* Track how many pages are held 'in-flight' */
pool->pages_state_hold_cnt++;
@@ -345,12 +365,12 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
DMA_ATTR_SKIP_CPU_SYNC);
page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
- page->pp_magic = 0;
+ page_pool_clear_pp_info(page);
/* This may be the last page returned, releasing the pool, so
* it is not safe to reference pool afterwards.
*/
- count = atomic_inc_return(&pool->pages_state_release_cnt);
+ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
trace_page_pool_state_release(pool, page, count);
}
EXPORT_SYMBOL(page_pool_release_page);
@@ -405,6 +425,11 @@ static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
unsigned int dma_sync_size, bool allow_direct)
{
+ /* It is not the last user for the page frag case */
+ if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
+ page_pool_atomic_sub_frag_count_return(page, 1))
+ return NULL;
+
/* This allocator is optimized for the XDP mode that uses
* one-frame-per-page, but have fallbacks that act like the
* regular page allocator APIs.
@@ -497,6 +522,84 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data,
}
EXPORT_SYMBOL(page_pool_put_page_bulk);
+static struct page *page_pool_drain_frag(struct page_pool *pool,
+ struct page *page)
+{
+ long drain_count = BIAS_MAX - pool->frag_users;
+
+ /* Some user is still using the page frag */
+ if (likely(page_pool_atomic_sub_frag_count_return(page,
+ drain_count)))
+ return NULL;
+
+ if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
+ if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+ page_pool_dma_sync_for_device(pool, page, -1);
+
+ return page;
+ }
+
+ page_pool_return_page(pool, page);
+ return NULL;
+}
+
+static void page_pool_free_frag(struct page_pool *pool)
+{
+ long drain_count = BIAS_MAX - pool->frag_users;
+ struct page *page = pool->frag_page;
+
+ pool->frag_page = NULL;
+
+ if (!page ||
+ page_pool_atomic_sub_frag_count_return(page, drain_count))
+ return;
+
+ page_pool_return_page(pool, page);
+}
+
+struct page *page_pool_alloc_frag(struct page_pool *pool,
+ unsigned int *offset,
+ unsigned int size, gfp_t gfp)
+{
+ unsigned int max_size = PAGE_SIZE << pool->p.order;
+ struct page *page = pool->frag_page;
+
+ if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
+ size > max_size))
+ return NULL;
+
+ size = ALIGN(size, dma_get_cache_alignment());
+ *offset = pool->frag_offset;
+
+ if (page && *offset + size > max_size) {
+ page = page_pool_drain_frag(pool, page);
+ if (page)
+ goto frag_reset;
+ }
+
+ if (!page) {
+ page = page_pool_alloc_pages(pool, gfp);
+ if (unlikely(!page)) {
+ pool->frag_page = NULL;
+ return NULL;
+ }
+
+ pool->frag_page = page;
+
+frag_reset:
+ pool->frag_users = 1;
+ *offset = 0;
+ pool->frag_offset = size;
+ page_pool_set_frag_count(page, BIAS_MAX);
+ return page;
+ }
+
+ pool->frag_users++;
+ pool->frag_offset = *offset + size;
+ return page;
+}
+EXPORT_SYMBOL(page_pool_alloc_frag);
+
static void page_pool_empty_ring(struct page_pool *pool)
{
struct page *page;
@@ -602,6 +705,8 @@ void page_pool_destroy(struct page_pool *pool)
if (!page_pool_put(pool))
return;
+ page_pool_free_frag(pool);
+
if (!page_pool_release(pool))
return;
@@ -634,7 +739,15 @@ bool page_pool_return_skb_page(struct page *page)
struct page_pool *pp;
page = compound_head(page);
- if (unlikely(page->pp_magic != PP_SIGNATURE))
+
+ /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
+ * in order to preserve any existing bits, such as bit 0 for the
+ * head page of compound page and bit 1 for pfmemalloc page, so
+ * mask those bits for freeing side when doing below checking,
+ * and page_is_pfmemalloc() is checked in __page_pool_put_page()
+ * to avoid recycling the pfmemalloc page.
+ */
+ if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
return false;
pp = page->pp;
@@ -644,7 +757,6 @@ bool page_pool_return_skb_page(struct page *page)
* The page will be returned to the pool here regardless of the
* 'flipped' fragment being in use or not.
*/
- page->pp = NULL;
page_pool_put_full_page(pp, page, false);
return true;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 7e258d255e90..9e5a3249373c 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -175,6 +175,9 @@
#define IP_NAME_SZ 32
#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
#define MPLS_STACK_BOTTOM htonl(0x00000100)
+/* Max number of internet mix entries that can be specified in imix_weights. */
+#define MAX_IMIX_ENTRIES 20
+#define IMIX_PRECISION 100 /* Precision of IMIX distribution */
#define func_enter() pr_debug("entering %s\n", __func__);
@@ -242,6 +245,12 @@ static char *pkt_flag_names[] = {
#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4)
#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4)
+struct imix_pkt {
+ u64 size;
+ u64 weight;
+ u64 count_so_far;
+};
+
struct flow_state {
__be32 cur_daddr;
int count;
@@ -343,6 +352,12 @@ struct pktgen_dev {
__u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
(see RFC 3260, sec. 4) */
+ /* IMIX */
+ unsigned int n_imix_entries;
+ struct imix_pkt imix_entries[MAX_IMIX_ENTRIES];
+ /* Maps 0-IMIX_PRECISION range to imix_entry based on probability*/
+ __u8 imix_distribution[IMIX_PRECISION];
+
/* MPLS */
unsigned int nr_labels; /* Depth of stack, 0 = no MPLS */
__be32 labels[MAX_MPLS_LABELS];
@@ -471,6 +486,7 @@ static void pktgen_stop_all_threads(struct pktgen_net *pn);
static void pktgen_stop(struct pktgen_thread *t);
static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
+static void fill_imix_distribution(struct pktgen_dev *pkt_dev);
/* Module parameters, defaults. */
static int pg_count_d __read_mostly = 1000;
@@ -552,6 +568,16 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
(unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size,
pkt_dev->max_pkt_size);
+ if (pkt_dev->n_imix_entries > 0) {
+ seq_puts(seq, " imix_weights: ");
+ for (i = 0; i < pkt_dev->n_imix_entries; i++) {
+ seq_printf(seq, "%llu,%llu ",
+ pkt_dev->imix_entries[i].size,
+ pkt_dev->imix_entries[i].weight);
+ }
+ seq_puts(seq, "\n");
+ }
+
seq_printf(seq,
" frags: %d delay: %llu clone_skb: %d ifname: %s\n",
pkt_dev->nfrags, (unsigned long long) pkt_dev->delay,
@@ -669,6 +695,18 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
(unsigned long long)pkt_dev->sofar,
(unsigned long long)pkt_dev->errors);
+ if (pkt_dev->n_imix_entries > 0) {
+ int i;
+
+ seq_puts(seq, " imix_size_counts: ");
+ for (i = 0; i < pkt_dev->n_imix_entries; i++) {
+ seq_printf(seq, "%llu,%llu ",
+ pkt_dev->imix_entries[i].size,
+ pkt_dev->imix_entries[i].count_so_far);
+ }
+ seq_puts(seq, "\n");
+ }
+
seq_printf(seq,
" started: %lluus stopped: %lluus idle: %lluus\n",
(unsigned long long) ktime_to_us(pkt_dev->started_at),
@@ -792,6 +830,62 @@ done_str:
return i;
}
+/* Parses imix entries from user buffer.
+ * The user buffer should consist of imix entries separated by spaces
+ * where each entry consists of size and weight delimited by commas.
+ * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example.
+ */
+static ssize_t get_imix_entries(const char __user *buffer,
+ struct pktgen_dev *pkt_dev)
+{
+ const int max_digits = 10;
+ int i = 0;
+ long len;
+ char c;
+
+ pkt_dev->n_imix_entries = 0;
+
+ do {
+ unsigned long weight;
+ unsigned long size;
+
+ len = num_arg(&buffer[i], max_digits, &size);
+ if (len < 0)
+ return len;
+ i += len;
+ if (get_user(c, &buffer[i]))
+ return -EFAULT;
+ /* Check for comma between size_i and weight_i */
+ if (c != ',')
+ return -EINVAL;
+ i++;
+
+ if (size < 14 + 20 + 8)
+ size = 14 + 20 + 8;
+
+ len = num_arg(&buffer[i], max_digits, &weight);
+ if (len < 0)
+ return len;
+ if (weight <= 0)
+ return -EINVAL;
+
+ pkt_dev->imix_entries[pkt_dev->n_imix_entries].size = size;
+ pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight;
+
+ i += len;
+ if (get_user(c, &buffer[i]))
+ return -EFAULT;
+
+ i++;
+ pkt_dev->n_imix_entries++;
+
+ if (pkt_dev->n_imix_entries > MAX_IMIX_ENTRIES)
+ return -E2BIG;
+ } while (c == ' ');
+
+ return i;
+}
+
static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
{
unsigned int n = 0;
@@ -960,6 +1054,20 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
+ if (!strcmp(name, "imix_weights")) {
+ if (pkt_dev->clone_skb > 0)
+ return -EINVAL;
+
+ len = get_imix_entries(&user_buffer[i], pkt_dev);
+ if (len < 0)
+ return len;
+
+ fill_imix_distribution(pkt_dev);
+
+ i += len;
+ return count;
+ }
+
if (!strcmp(name, "debug")) {
len = num_arg(&user_buffer[i], 10, &value);
if (len < 0)
@@ -1082,10 +1190,16 @@ static ssize_t pktgen_if_write(struct file *file,
len = num_arg(&user_buffer[i], 10, &value);
if (len < 0)
return len;
+ /* clone_skb is not supported for netif_receive xmit_mode and
+ * IMIX mode.
+ */
if ((value > 0) &&
((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
return -ENOTSUPP;
+ if (value > 0 && pkt_dev->n_imix_entries > 0)
+ return -EINVAL;
+
i += len;
pkt_dev->clone_skb = value;
@@ -1190,11 +1304,6 @@ static ssize_t pktgen_if_write(struct file *file,
* pktgen_xmit() is called
*/
pkt_dev->last_ok = 1;
-
- /* override clone_skb if user passed default value
- * at module loading time
- */
- pkt_dev->clone_skb = 0;
} else if (strcmp(f, "queue_xmit") == 0) {
pkt_dev->xmit_mode = M_QUEUE_XMIT;
pkt_dev->last_ok = 1;
@@ -2477,6 +2586,14 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
t = pkt_dev->min_pkt_size;
}
pkt_dev->cur_pkt_size = t;
+ } else if (pkt_dev->n_imix_entries > 0) {
+ struct imix_pkt *entry;
+ __u32 t = prandom_u32() % IMIX_PRECISION;
+ __u8 entry_index = pkt_dev->imix_distribution[t];
+
+ entry = &pkt_dev->imix_entries[entry_index];
+ entry->count_so_far++;
+ pkt_dev->cur_pkt_size = entry->size;
}
set_cur_queue_map(pkt_dev);
@@ -2484,6 +2601,32 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
pkt_dev->flows[flow].count++;
}
+static void fill_imix_distribution(struct pktgen_dev *pkt_dev)
+{
+ int cumulative_probabilites[MAX_IMIX_ENTRIES];
+ int j = 0;
+ __u64 cumulative_prob = 0;
+ __u64 total_weight = 0;
+ int i = 0;
+
+ for (i = 0; i < pkt_dev->n_imix_entries; i++)
+ total_weight += pkt_dev->imix_entries[i].weight;
+
+ /* Fill cumulative_probabilites with sum of normalized probabilities */
+ for (i = 0; i < pkt_dev->n_imix_entries - 1; i++) {
+ cumulative_prob += div64_u64(pkt_dev->imix_entries[i].weight *
+ IMIX_PRECISION,
+ total_weight);
+ cumulative_probabilites[i] = cumulative_prob;
+ }
+ cumulative_probabilites[pkt_dev->n_imix_entries - 1] = 100;
+
+ for (i = 0; i < IMIX_PRECISION; i++) {
+ if (i == cumulative_probabilites[j])
+ j++;
+ pkt_dev->imix_distribution[i] = j;
+ }
+}
#ifdef CONFIG_XFRM
static u32 pktgen_dst_metrics[RTAX_MAX + 1] = {
@@ -3145,7 +3288,19 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC,
ktime_to_ns(elapsed));
- bps = pps * 8 * pkt_dev->cur_pkt_size;
+ if (pkt_dev->n_imix_entries > 0) {
+ int i;
+ struct imix_pkt *entry;
+
+ bps = 0;
+ for (i = 0; i < pkt_dev->n_imix_entries; i++) {
+ entry = &pkt_dev->imix_entries[i];
+ bps += entry->size * entry->count_so_far;
+ }
+ bps = div64_u64(bps * 8 * NSEC_PER_SEC, ktime_to_ns(elapsed));
+ } else {
+ bps = pps * 8 * pkt_dev->cur_pkt_size;
+ }
mbps = bps;
do_div(mbps, 1000000);
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
index e33fde06d528..dd4cf01d1e0a 100644
--- a/net/core/ptp_classifier.c
+++ b/net/core/ptp_classifier.c
@@ -103,7 +103,7 @@ static struct bpf_prog *ptp_insns __read_mostly;
unsigned int ptp_classify_raw(const struct sk_buff *skb)
{
- return BPF_PROG_RUN(ptp_insns, skb);
+ return bpf_prog_run(ptp_insns, skb);
}
EXPORT_SYMBOL_GPL(ptp_classify_raw);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f6af3e74fc44..972c8cb303a5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -710,15 +710,8 @@ out:
int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo)
{
struct sock *rtnl = net->rtnl;
- int err = 0;
- NETLINK_CB(skb).dst_group = group;
- if (echo)
- refcount_inc(&skb->users);
- netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
- if (echo)
- err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
- return err;
+ return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL);
}
int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
@@ -733,12 +726,8 @@ void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
struct nlmsghdr *nlh, gfp_t flags)
{
struct sock *rtnl = net->rtnl;
- int report = 0;
- if (nlh)
- report = nlmsg_report(nlh);
-
- nlmsg_notify(rtnl, skb, pid, group, report, flags);
+ nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags);
}
EXPORT_SYMBOL(rtnl_notify);
@@ -1970,6 +1959,13 @@ static bool link_master_filtered(struct net_device *dev, int master_idx)
return false;
master = netdev_master_upper_dev_get(dev);
+
+ /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need
+ * another invalid value for ifindex to denote "no master".
+ */
+ if (master_idx == -1)
+ return !!master;
+
if (!master || master->ifindex != master_idx)
return true;
@@ -2268,7 +2264,8 @@ invalid_attr:
return -EINVAL;
}
-static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
+static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
{
if (dev) {
if (tb[IFLA_ADDRESS] &&
@@ -2295,7 +2292,7 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
return -EOPNOTSUPP;
if (af_ops->validate_link_af) {
- err = af_ops->validate_link_af(dev, af);
+ err = af_ops->validate_link_af(dev, af, extack);
if (err < 0)
return err;
}
@@ -2603,11 +2600,12 @@ static int do_setlink(const struct sk_buff *skb,
const struct net_device_ops *ops = dev->netdev_ops;
int err;
- err = validate_linkmsg(dev, tb);
+ err = validate_linkmsg(dev, tb, extack);
if (err < 0)
return err;
if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) {
+ const char *pat = ifname && ifname[0] ? ifname : NULL;
struct net *net;
int new_ifindex;
@@ -2623,7 +2621,7 @@ static int do_setlink(const struct sk_buff *skb,
else
new_ifindex = 0;
- err = __dev_change_net_namespace(dev, net, ifname, new_ifindex);
+ err = __dev_change_net_namespace(dev, net, pat, new_ifindex);
put_net(net);
if (err)
goto errout;
@@ -3301,7 +3299,7 @@ replay:
m_ops = master_dev->rtnl_link_ops;
}
- err = validate_linkmsg(dev, tb);
+ err = validate_linkmsg(dev, tb, extack);
if (err < 0)
return err;
diff --git a/net/core/scm.c b/net/core/scm.c
index ae3085d9aae8..5c356f0dee30 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -79,7 +79,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
if (!fpl)
{
- fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+ fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT);
if (!fpl)
return -ENOMEM;
*fplp = fpl;
@@ -355,7 +355,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
return NULL;
new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (new_fpl) {
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
diff --git a/net/core/selftests.c b/net/core/selftests.c
index ba7b0171974c..9077fa969892 100644
--- a/net/core/selftests.c
+++ b/net/core/selftests.c
@@ -318,6 +318,15 @@ static int net_test_phy_loopback_udp(struct net_device *ndev)
return __net_test_loopback(ndev, &attr);
}
+static int net_test_phy_loopback_udp_mtu(struct net_device *ndev)
+{
+ struct net_packet_attrs attr = { };
+
+ attr.dst = ndev->dev_addr;
+ attr.max_size = ndev->mtu;
+ return __net_test_loopback(ndev, &attr);
+}
+
static int net_test_phy_loopback_tcp(struct net_device *ndev)
{
struct net_packet_attrs attr = { };
@@ -345,6 +354,9 @@ static const struct net_test {
.name = "PHY internal loopback, UDP ",
.fn = net_test_phy_loopback_udp,
}, {
+ .name = "PHY internal loopback, MTU ",
+ .fn = net_test_phy_loopback_udp_mtu,
+ }, {
.name = "PHY internal loopback, TCP ",
.fn = net_test_phy_loopback_tcp,
}, {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 12aabcda6db2..f9311762cc47 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -156,7 +156,7 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
void *data;
fragsz = SKB_DATA_ALIGN(fragsz);
- if (in_irq() || irqs_disabled()) {
+ if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
} else {
@@ -502,7 +502,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- if (in_irq() || irqs_disabled()) {
+ if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc(nc, len, gfp_mask);
pfmemalloc = nc->pfmemalloc;
@@ -663,7 +663,7 @@ static void skb_release_data(struct sk_buff *skb)
if (skb->cloned &&
atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
&shinfo->dataref))
- return;
+ goto exit;
skb_zcopy_clear(skb, true);
@@ -674,6 +674,17 @@ static void skb_release_data(struct sk_buff *skb)
kfree_skb_list(shinfo->frag_list);
skb_free_head(skb);
+exit:
+ /* When we clone an SKB we copy the reycling bit. The pp_recycle
+ * bit is only set on the head though, so in order to avoid races
+ * while trying to recycle fragments on __skb_frag_unref() we need
+ * to make one SKB responsible for triggering the recycle path.
+ * So disable the recycling bit if an SKB is cloned and we have
+ * additional references to to the fragmented part of the SKB.
+ * Eventually the last SKB will have the recycling bit set and it's
+ * dataref set to 0, which will trigger the recycling
+ */
+ skb->pp_recycle = 0;
}
/*
@@ -713,7 +724,7 @@ void skb_release_head_state(struct sk_buff *skb)
{
skb_dst_drop(skb);
if (skb->destructor) {
- WARN_ON(in_irq());
+ WARN_ON(in_hardirq());
skb->destructor(skb);
}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -943,8 +954,13 @@ void __kfree_skb_defer(struct sk_buff *skb)
void napi_skb_free_stolen_head(struct sk_buff *skb)
{
- skb_dst_drop(skb);
- skb_ext_put(skb);
+ if (unlikely(skb->slow_gro)) {
+ nf_reset_ct(skb);
+ skb_dst_drop(skb);
+ skb_ext_put(skb);
+ skb_orphan(skb);
+ skb->slow_gro = 0;
+ }
napi_skb_cache_put(skb);
}
@@ -1774,6 +1790,48 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
EXPORT_SYMBOL(skb_realloc_headroom);
/**
+ * skb_expand_head - reallocate header of &sk_buff
+ * @skb: buffer to reallocate
+ * @headroom: needed headroom
+ *
+ * Unlike skb_realloc_headroom, this one does not allocate a new skb
+ * if possible; copies skb->sk to new skb as needed
+ * and frees original skb in case of failures.
+ *
+ * It expect increased headroom and generates warning otherwise.
+ */
+
+struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
+{
+ int delta = headroom - skb_headroom(skb);
+
+ if (WARN_ONCE(delta <= 0,
+ "%s is expecting an increase in the headroom", __func__))
+ return skb;
+
+ /* pskb_expand_head() might crash, if skb is shared */
+ if (skb_shared(skb)) {
+ struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+ if (likely(nskb)) {
+ if (skb->sk)
+ skb_set_owner_w(nskb, skb->sk);
+ consume_skb(skb);
+ } else {
+ kfree_skb(skb);
+ }
+ skb = nskb;
+ }
+ if (skb &&
+ pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+ kfree_skb(skb);
+ skb = NULL;
+ }
+ return skb;
+}
+EXPORT_SYMBOL(skb_expand_head);
+
+/**
* skb_copy_expand - copy and expand sk_buff
* @skb: buffer to copy
* @newheadroom: new free bytes at head
@@ -3010,8 +3068,11 @@ skb_zerocopy_headlen(const struct sk_buff *from)
if (!from->head_frag ||
skb_headlen(from) < L1_CACHE_BYTES ||
- skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) {
hlen = skb_headlen(from);
+ if (!hlen)
+ hlen = from->len;
+ }
if (skb_has_frag_list(from))
hlen = from->len;
@@ -3874,6 +3935,9 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
NAPI_GRO_CB(p)->last = skb;
NAPI_GRO_CB(p)->count++;
p->data_len += skb->len;
+
+ /* sk owenrship - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
p->truesize += skb->truesize;
p->len += skb->len;
@@ -4241,6 +4305,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
unsigned int headlen = skb_headlen(skb);
unsigned int len = skb_gro_len(skb);
unsigned int delta_truesize;
+ unsigned int new_truesize;
struct sk_buff *lp;
if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush))
@@ -4272,10 +4337,10 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
skb_frag_size_sub(frag, offset);
/* all fragments truesize : remove (head size + sk_buff) */
- delta_truesize = skb->truesize -
- SKB_TRUESIZE(skb_end_offset(skb));
+ new_truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ delta_truesize = skb->truesize - new_truesize;
- skb->truesize -= skb->data_len;
+ skb->truesize = new_truesize;
skb->len -= skb->data_len;
skb->data_len = 0;
@@ -4304,12 +4369,16 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
/* We dont need to clear skbinfo->nr_frags here */
- delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ delta_truesize = skb->truesize - new_truesize;
+ skb->truesize = new_truesize;
NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
goto done;
}
merge:
+ /* sk owenrship - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
delta_truesize = skb->truesize;
if (offset > headlen) {
unsigned int eat = offset - headlen;
@@ -6434,6 +6503,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
new->chunks = newlen;
new->offset[id] = newoff;
set_active:
+ skb->slow_gro = 1;
skb->extensions = new;
skb->active_extensions |= 1 << id;
return skb_ext_get_ptr(new, id);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 9b6160a191f8..2d6249b28928 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -508,10 +508,8 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
if (skb_linearize(skb))
return -EAGAIN;
num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
- if (unlikely(num_sge < 0)) {
- kfree(msg);
+ if (unlikely(num_sge < 0))
return num_sge;
- }
copied = skb->len;
msg->sg.start = 0;
@@ -530,6 +528,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
{
struct sock *sk = psock->sk;
struct sk_msg *msg;
+ int err;
/* If we are receiving on the same sock skb->sk is already assigned,
* skip memory accounting and owner transition seeing it already set
@@ -548,7 +547,10 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
* into user buffers.
*/
skb_set_owner_r(skb, sk);
- return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+ err = sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+ if (err < 0)
+ kfree(msg);
+ return err;
}
/* Puts an skb on the ingress queue of the socket already assigned to the
@@ -559,12 +561,16 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
{
struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
struct sock *sk = psock->sk;
+ int err;
if (unlikely(!msg))
return -EAGAIN;
sk_msg_init(msg);
skb_set_owner_r(skb, sk);
- return sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+ err = sk_psock_skb_ingress_enqueue(skb, psock, sk, msg);
+ if (err < 0)
+ kfree(msg);
+ return err;
}
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
@@ -578,29 +584,42 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
return sk_psock_skb_ingress(psock, skb);
}
-static void sock_drop(struct sock *sk, struct sk_buff *skb)
+static void sk_psock_skb_state(struct sk_psock *psock,
+ struct sk_psock_work_state *state,
+ struct sk_buff *skb,
+ int len, int off)
{
- sk_drops_add(sk, skb);
- kfree_skb(skb);
+ spin_lock_bh(&psock->ingress_lock);
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+ state->skb = skb;
+ state->len = len;
+ state->off = off;
+ } else {
+ sock_drop(psock->sk, skb);
+ }
+ spin_unlock_bh(&psock->ingress_lock);
}
static void sk_psock_backlog(struct work_struct *work)
{
struct sk_psock *psock = container_of(work, struct sk_psock, work);
struct sk_psock_work_state *state = &psock->work_state;
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
bool ingress;
u32 len, off;
int ret;
mutex_lock(&psock->work_mutex);
- if (state->skb) {
+ if (unlikely(state->skb)) {
+ spin_lock_bh(&psock->ingress_lock);
skb = state->skb;
len = state->len;
off = state->off;
state->skb = NULL;
- goto start;
+ spin_unlock_bh(&psock->ingress_lock);
}
+ if (skb)
+ goto start;
while ((skb = skb_dequeue(&psock->ingress_skb))) {
len = skb->len;
@@ -615,9 +634,8 @@ start:
len, ingress);
if (ret <= 0) {
if (ret == -EAGAIN) {
- state->skb = skb;
- state->len = len;
- state->off = off;
+ sk_psock_skb_state(psock, state, skb,
+ len, off);
goto end;
}
/* Hard errors break pipe and stop xmit. */
@@ -716,6 +734,11 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock)
skb_bpf_redirect_clear(skb);
sock_drop(psock->sk, skb);
}
+ kfree_skb(psock->work_state.skb);
+ /* We null the skb here to ensure that calls to sk_psock_backlog
+ * do not pick up the free'd skb.
+ */
+ psock->work_state.skb = NULL;
__sk_psock_purge_ingress_msg(psock);
}
@@ -767,8 +790,6 @@ static void sk_psock_destroy(struct work_struct *work)
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
- sk_psock_stop(psock, false);
-
write_lock_bh(&sk->sk_callback_lock);
sk_psock_restore_proto(sk, psock);
rcu_assign_sk_user_data(sk, NULL);
@@ -778,6 +799,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
+ sk_psock_stop(psock, false);
+
INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
queue_rcu_work(system_wq, &psock->rwork);
}
diff --git a/net/core/sock.c b/net/core/sock.c
index ba1c0f75cd45..62627e868e03 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -139,6 +139,8 @@
#include <net/tcp.h>
#include <net/busy_poll.h>
+#include <linux/ethtool.h>
+
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
@@ -224,6 +226,7 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
+ x "AF_MCTP" , \
x "AF_MAX"
static const char *const af_family_key_strings[AF_MAX+1] = {
@@ -810,8 +813,47 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
}
}
-int sock_set_timestamping(struct sock *sk, int optname, int val)
+static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
+{
+ struct net *net = sock_net(sk);
+ struct net_device *dev = NULL;
+ bool match = false;
+ int *vclock_index;
+ int i, num;
+
+ if (sk->sk_bound_dev_if)
+ dev = dev_get_by_index(net, sk->sk_bound_dev_if);
+
+ if (!dev) {
+ pr_err("%s: sock not bind to device\n", __func__);
+ return -EOPNOTSUPP;
+ }
+
+ num = ethtool_get_phc_vclocks(dev, &vclock_index);
+ for (i = 0; i < num; i++) {
+ if (*(vclock_index + i) == phc_index) {
+ match = true;
+ break;
+ }
+ }
+
+ if (num > 0)
+ kfree(vclock_index);
+
+ if (!match)
+ return -EINVAL;
+
+ sk->sk_bind_phc = phc_index;
+
+ return 0;
+}
+
+int sock_set_timestamping(struct sock *sk, int optname,
+ struct so_timestamping timestamping)
{
+ int val = timestamping.flags;
+ int ret;
+
if (val & ~SOF_TIMESTAMPING_MASK)
return -EINVAL;
@@ -832,6 +874,12 @@ int sock_set_timestamping(struct sock *sk, int optname, int val)
!(val & SOF_TIMESTAMPING_OPT_TSONLY))
return -EINVAL;
+ if (val & SOF_TIMESTAMPING_BIND_PHC) {
+ ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
+ if (ret)
+ return ret;
+ }
+
sk->sk_tsflags = val;
sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
@@ -907,6 +955,7 @@ EXPORT_SYMBOL(sock_set_mark);
int sock_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
+ struct so_timestamping timestamping;
struct sock_txtime sk_txtime;
struct sock *sk = sock->sk;
int val;
@@ -1068,12 +1117,22 @@ set_sndbuf:
case SO_TIMESTAMP_NEW:
case SO_TIMESTAMPNS_OLD:
case SO_TIMESTAMPNS_NEW:
- sock_set_timestamp(sk, valbool, optname);
+ sock_set_timestamp(sk, optname, valbool);
break;
case SO_TIMESTAMPING_NEW:
case SO_TIMESTAMPING_OLD:
- ret = sock_set_timestamping(sk, optname, val);
+ if (optlen == sizeof(timestamping)) {
+ if (copy_from_sockptr(&timestamping, optval,
+ sizeof(timestamping))) {
+ ret = -EFAULT;
+ break;
+ }
+ } else {
+ memset(&timestamping, 0, sizeof(timestamping));
+ timestamping.flags = val;
+ }
+ ret = sock_set_timestamping(sk, optname, timestamping);
break;
case SO_RCVLOWAT:
@@ -1201,7 +1260,7 @@ set_sndbuf:
if (val < 0)
ret = -EINVAL;
else
- sk->sk_ll_usec = val;
+ WRITE_ONCE(sk->sk_ll_usec, val);
}
break;
case SO_PREFER_BUSY_POLL:
@@ -1299,6 +1358,15 @@ set_sndbuf:
ret = sock_bindtoindex_locked(sk, val);
break;
+ case SO_BUF_LOCK:
+ if (val & ~SOCK_BUF_LOCK_MASK) {
+ ret = -EINVAL;
+ break;
+ }
+ sk->sk_userlocks = val | (sk->sk_userlocks &
+ ~SOCK_BUF_LOCK_MASK);
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -1348,6 +1416,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
struct __kernel_old_timeval tm;
struct __kernel_sock_timeval stm;
struct sock_txtime txtime;
+ struct so_timestamping timestamping;
} v;
int lv = sizeof(int);
@@ -1451,7 +1520,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_TIMESTAMPING_OLD:
- v.val = sk->sk_tsflags;
+ lv = sizeof(v.timestamping);
+ v.timestamping.flags = sk->sk_tsflags;
+ v.timestamping.bind_phc = sk->sk_bind_phc;
break;
case SO_RCVTIMEO_OLD:
@@ -1658,6 +1729,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val64 = sock_net(sk)->net_cookie;
break;
+ case SO_BUF_LOCK:
+ v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -2499,7 +2574,6 @@ static void sk_leave_memory_pressure(struct sock *sk)
}
}
-#define SKB_FRAG_PAGE_ORDER get_order(32768)
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
/**
@@ -2653,10 +2727,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
struct proto *prot = sk->sk_prot;
long allocated = sk_memory_allocated_add(sk, amt);
+ bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
bool charged = true;
- if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
- !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
+ if (memcg_charge &&
+ !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+ gfp_memcg_charge())))
goto suppress_allocation;
/* Under limit. */
@@ -2710,8 +2786,14 @@ suppress_allocation:
/* Fail only if socket is _under_ its sndbuf.
* In this case we cannot block, so that we have to fail.
*/
- if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
+ /* Force charge with __GFP_NOFAIL */
+ if (memcg_charge && !charged) {
+ mem_cgroup_charge_skmem(sk->sk_memcg, amt,
+ gfp_memcg_charge() | __GFP_NOFAIL);
+ }
return 1;
+ }
}
if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
@@ -2719,7 +2801,7 @@ suppress_allocation:
sk_memory_allocated_sub(sk, amt);
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ if (memcg_charge && charged)
mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
return 0;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 60decd6420ca..e252b8ec2b85 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -211,8 +211,6 @@ out:
return psock;
}
-static bool sock_map_redirect_allowed(const struct sock *sk);
-
static int sock_map_link(struct bpf_map *map, struct sock *sk)
{
struct sk_psock_progs *progs = sock_map_progs(map);
@@ -223,13 +221,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
struct sk_psock *psock;
int ret;
- /* Only sockets we can redirect into/from in BPF need to hold
- * refs to parser/verdict progs and have their sk_data_ready
- * and sk_write_space callbacks overridden.
- */
- if (!sock_map_redirect_allowed(sk))
- goto no_progs;
-
stream_verdict = READ_ONCE(progs->stream_verdict);
if (stream_verdict) {
stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
@@ -264,7 +255,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
}
}
-no_progs:
psock = sock_map_psock_get_checked(sk);
if (IS_ERR(psock)) {
ret = PTR_ERR(psock);
@@ -527,12 +517,6 @@ static bool sk_is_tcp(const struct sock *sk)
sk->sk_protocol == IPPROTO_TCP;
}
-static bool sk_is_udp(const struct sock *sk)
-{
- return sk->sk_type == SOCK_DGRAM &&
- sk->sk_protocol == IPPROTO_UDP;
-}
-
static bool sock_map_redirect_allowed(const struct sock *sk)
{
if (sk_is_tcp(sk))
@@ -550,10 +534,7 @@ static bool sock_map_sk_state_allowed(const struct sock *sk)
{
if (sk_is_tcp(sk))
return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
- else if (sk_is_udp(sk))
- return sk_hashed(sk);
-
- return false;
+ return true;
}
static int sock_hash_update_common(struct bpf_map *map, void *key,
@@ -1513,6 +1494,7 @@ void sock_map_unhash(struct sock *sk)
rcu_read_unlock();
saved_unhash(sk);
}
+EXPORT_SYMBOL_GPL(sock_map_unhash);
void sock_map_close(struct sock *sk, long timeout)
{
@@ -1536,6 +1518,7 @@ void sock_map_close(struct sock *sk, long timeout)
release_sock(sk);
saved_close(sk, timeout);
}
+EXPORT_SYMBOL_GPL(sock_map_close);
static int sock_map_iter_attach_target(struct bpf_prog *prog,
union bpf_iter_link_info *linfo,