From 0b5c21bbc01e92745ca1ca4f6fd87d878fa3ea5e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 Apr 2022 11:38:47 +0200 Subject: net: ensure net_todo_list is processed quickly In [1], Will raised a potential issue that the cfg80211 code, which does (from a locking perspective) rtnl_lock() wiphy_lock() rtnl_unlock() might be susceptible to ABBA deadlocks, because rtnl_unlock() calls netdev_run_todo(), which might end up calling rtnl_lock() again, which could then deadlock (see the comment in the code added here for the scenario). Some back and forth and thinking ensued, but clearly this can't happen if the net_todo_list is empty at the rtnl_unlock() here. Clearly, the code here cannot actually put an entry on it, and all other users of rtnl_unlock() will empty it since that will always go through netdev_run_todo(), emptying the list. So the only other way to get there would be to add to the list and then unlock the RTNL without going through rtnl_unlock(), which is only possible through __rtnl_unlock(). However, this isn't exported and is not used in many places, and none of them seem to be able to unregister before using it. Therefore, add a WARN_ON() in the code to ensure this invariant won't be broken, so that the cfg80211 (or any similar) code stays safe. [1] https://lore.kernel.org/r/Yjzpo3TfZxtKPMAG@google.com Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20220404113847.0ee02e4a70da.Ic73d206e217db20fd22dcec14fe5442ca732804b@changeid Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 8c6c08446556..2ec17358d7b4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9431,7 +9431,7 @@ static int dev_new_index(struct net *net) } /* Delayed registration/unregisteration */ -static LIST_HEAD(net_todo_list); +LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) -- cgit v1.2.3-59-g8ed1b From a333215e10cb5d3b1e0685ca117f0e9452215485 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 5 Apr 2022 21:57:48 +0200 Subject: net: ethernet: mtk_eth_soc: implement flow offloading to WED devices This allows hardware flow offloading from Ethernet to WLAN on the MT7622 SoC. Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau Signed-off-by: David S. Miller
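
For context before the patch body: the Ethernet side discovers the WLAN transmit target by walking the forwarding path with ndo_fill_forward_path(). A minimal sketch of what a cooperating WLAN driver might report is shown below; the driver name, the station lookup helper and the field values are illustrative assumptions only (the real implementation lives in the WLAN driver), not code from this patch:

    #include <linux/netdevice.h>

    struct demo_sta { u8 bss_idx; u16 wcid; };

    /* assumed station lookup by destination MAC; not a real API */
    static struct demo_sta *demo_sta_lookup(const struct net_device *dev,
                                            const u8 *daddr);

    static int demo_fill_forward_path(struct net_device_path_ctx *ctx,
                                      struct net_device_path *path)
    {
            struct demo_sta *sta = demo_sta_lookup(ctx->dev, ctx->daddr);

            if (!sta)
                    return -ENOENT;

            path->type = DEV_PATH_MTK_WDMA;
            path->dev = ctx->dev;
            path->mtk_wdma.wdma_idx = 0;        /* which WDMA block */
            path->mtk_wdma.queue = 0;           /* WDMA ring to use */
            path->mtk_wdma.bss = sta->bss_idx;  /* BSS index for WINFO */
            path->mtk_wdma.wcid = sta->wcid;    /* station table index */

            return 0;
    }

mtk_flow_get_wdma_info() in this patch performs exactly such a walk, checks that the reported path type is DEV_PATH_MTK_WDMA, and then programs the hardware forwarding entry via mtk_foe_entry_set_wdma().
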
--- drivers/net/ethernet/mediatek/mtk_ppe.c | 18 ++++++++ drivers/net/ethernet/mediatek/mtk_ppe.h | 14 ++++--- drivers/net/ethernet/mediatek/mtk_ppe_offload.c | 56 ++++++++++++++++++++++++- drivers/net/ethernet/mediatek/mtk_wed.h | 7 ++++ include/linux/netdevice.h | 7 ++++ net/core/dev.c | 4 ++ 6 files changed, 98 insertions(+), 8 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c index 3ad10c793308..472bcd3269a7 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe.c @@ -329,6 +329,24 @@ int mtk_foe_entry_set_pppoe(struct mtk_foe_entry *entry, int sid) return 0; } +int mtk_foe_entry_set_wdma(struct mtk_foe_entry *entry, int wdma_idx, int txq, + int bss, int wcid) +{ + struct mtk_foe_mac_info *l2 = mtk_foe_entry_l2(entry); + u32 *ib2 = mtk_foe_entry_ib2(entry); + + *ib2 &= ~MTK_FOE_IB2_PORT_MG; + *ib2 |= MTK_FOE_IB2_WDMA_WINFO; + if (wdma_idx) + *ib2 |= MTK_FOE_IB2_WDMA_DEVIDX; + + l2->vlan2 = FIELD_PREP(MTK_FOE_VLAN2_WINFO_BSS, bss) | + FIELD_PREP(MTK_FOE_VLAN2_WINFO_WCID, wcid) | + FIELD_PREP(MTK_FOE_VLAN2_WINFO_RING, txq); + + return 0; +} + static inline bool mtk_foe_entry_usable(struct mtk_foe_entry *entry) { return !(entry->ib1 & MTK_FOE_IB1_STATIC) && diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.h b/drivers/net/ethernet/mediatek/mtk_ppe.h index 242fb8f2ae65..df8ccaf48171 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.h +++ b/drivers/net/ethernet/mediatek/mtk_ppe.h @@ -48,9 +48,9 @@ enum { #define MTK_FOE_IB2_DEST_PORT GENMASK(7, 5) #define MTK_FOE_IB2_MULTICAST BIT(8) -#define MTK_FOE_IB2_WHNAT_QID2 GENMASK(13, 12) -#define MTK_FOE_IB2_WHNAT_DEVIDX BIT(16) -#define MTK_FOE_IB2_WHNAT_NAT BIT(17) +#define MTK_FOE_IB2_WDMA_QID2 GENMASK(13, 12) +#define MTK_FOE_IB2_WDMA_DEVIDX BIT(16) +#define MTK_FOE_IB2_WDMA_WINFO BIT(17) #define MTK_FOE_IB2_PORT_MG GENMASK(17, 12) @@ -58,9 +58,9 @@ enum { #define MTK_FOE_IB2_DSCP GENMASK(31, 24) -#define MTK_FOE_VLAN2_WHNAT_BSS GEMMASK(5, 0) -#define MTK_FOE_VLAN2_WHNAT_WCID GENMASK(13, 6) -#define MTK_FOE_VLAN2_WHNAT_RING GENMASK(15, 14) +#define MTK_FOE_VLAN2_WINFO_BSS GENMASK(5, 0) +#define MTK_FOE_VLAN2_WINFO_WCID GENMASK(13, 6) +#define MTK_FOE_VLAN2_WINFO_RING GENMASK(15, 14) enum { MTK_FOE_STATE_INVALID, @@ -281,6 +281,8 @@ int mtk_foe_entry_set_ipv6_tuple(struct mtk_foe_entry *entry, int mtk_foe_entry_set_dsa(struct mtk_foe_entry *entry, int port); int mtk_foe_entry_set_vlan(struct mtk_foe_entry *entry, int vid); int mtk_foe_entry_set_pppoe(struct mtk_foe_entry *entry, int sid); +int mtk_foe_entry_set_wdma(struct mtk_foe_entry *entry, int wdma_idx, int txq, + int bss, int wcid); int mtk_foe_entry_commit(struct mtk_ppe *ppe, struct mtk_foe_entry *entry, u16 timestamp); int mtk_ppe_debugfs_init(struct mtk_ppe *ppe); diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c index 7bb1f20002b5..bcf342bb9051 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c @@ -10,6 +10,7 @@ #include #include #include "mtk_eth_soc.h" +#include "mtk_wed.h" struct mtk_flow_data { struct ethhdr eth; @@ -39,6 +40,7 @@ struct mtk_flow_entry { struct rhash_head node; unsigned long cookie; u16 hash; + s8 wed_index; }; static const struct rhashtable_params mtk_flow_ht_params = { @@ -80,6 +82,35 @@ mtk_flow_offload_mangle_eth(const struct flow_action_entry *act, void *eth) memcpy(dest, src, act->mangle.mask ? 
2 : 4); } +static int +mtk_flow_get_wdma_info(struct net_device *dev, const u8 *addr, struct mtk_wdma_info *info) +{ + struct net_device_path_ctx ctx = { + .dev = dev, + .daddr = addr, + }; + struct net_device_path path = {}; + + if (!IS_ENABLED(CONFIG_NET_MEDIATEK_SOC_WED)) + return -1; + + if (!dev->netdev_ops->ndo_fill_forward_path) + return -1; + + if (dev->netdev_ops->ndo_fill_forward_path(&ctx, &path)) + return -1; + + if (path.type != DEV_PATH_MTK_WDMA) + return -1; + + info->wdma_idx = path.mtk_wdma.wdma_idx; + info->queue = path.mtk_wdma.queue; + info->bss = path.mtk_wdma.bss; + info->wcid = path.mtk_wdma.wcid; + + return 0; +} + static int mtk_flow_mangle_ports(const struct flow_action_entry *act, @@ -149,10 +180,20 @@ mtk_flow_get_dsa_port(struct net_device **dev) static int mtk_flow_set_output_device(struct mtk_eth *eth, struct mtk_foe_entry *foe, - struct net_device *dev) + struct net_device *dev, const u8 *dest_mac, + int *wed_index) { + struct mtk_wdma_info info = {}; int pse_port, dsa_port; + if (mtk_flow_get_wdma_info(dev, dest_mac, &info) == 0) { + mtk_foe_entry_set_wdma(foe, info.wdma_idx, info.queue, info.bss, + info.wcid); + pse_port = 3; + *wed_index = info.wdma_idx; + goto out; + } + dsa_port = mtk_flow_get_dsa_port(&dev); if (dsa_port >= 0) mtk_foe_entry_set_dsa(foe, dsa_port); @@ -164,6 +205,7 @@ mtk_flow_set_output_device(struct mtk_eth *eth, struct mtk_foe_entry *foe, else return -EOPNOTSUPP; +out: mtk_foe_entry_set_pse_port(foe, pse_port); return 0; @@ -179,6 +221,7 @@ mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f) struct net_device *odev = NULL; struct mtk_flow_entry *entry; int offload_type = 0; + int wed_index = -1; u16 addr_type = 0; u32 timestamp; u8 l4proto = 0; @@ -326,10 +369,14 @@ mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f) if (data.pppoe.num == 1) mtk_foe_entry_set_pppoe(&foe, data.pppoe.sid); - err = mtk_flow_set_output_device(eth, &foe, odev); + err = mtk_flow_set_output_device(eth, &foe, odev, data.eth.h_dest, + &wed_index); if (err) return err; + if (wed_index >= 0 && (err = mtk_wed_flow_add(wed_index)) < 0) + return err; + entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; @@ -343,6 +390,7 @@ mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f) } entry->hash = hash; + entry->wed_index = wed_index; err = rhashtable_insert_fast(ð->flow_table, &entry->node, mtk_flow_ht_params); if (err < 0) @@ -353,6 +401,8 @@ clear_flow: mtk_foe_entry_clear(ð->ppe, hash); free: kfree(entry); + if (wed_index >= 0) + mtk_wed_flow_remove(wed_index); return err; } @@ -369,6 +419,8 @@ mtk_flow_offload_destroy(struct mtk_eth *eth, struct flow_cls_offload *f) mtk_foe_entry_clear(ð->ppe, entry->hash); rhashtable_remove_fast(ð->flow_table, &entry->node, mtk_flow_ht_params); + if (entry->wed_index >= 0) + mtk_wed_flow_remove(entry->wed_index); kfree(entry); return 0; diff --git a/drivers/net/ethernet/mediatek/mtk_wed.h b/drivers/net/ethernet/mediatek/mtk_wed.h index 404c9a9b130d..981ec613f4b0 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.h +++ b/drivers/net/ethernet/mediatek/mtk_wed.h @@ -7,6 +7,7 @@ #include #include #include +#include struct mtk_eth; @@ -27,6 +28,12 @@ struct mtk_wed_hw { int index; }; +struct mtk_wdma_info { + u8 wdma_idx; + u8 queue; + u16 wcid; + u8 bss; +}; #ifdef CONFIG_NET_MEDIATEK_SOC_WED static inline void diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b6a1e7f643da..7b2a0b739684 100644 --- a/include/linux/netdevice.h +++ 
b/include/linux/netdevice.h @@ -862,6 +862,7 @@ enum net_device_path_type { DEV_PATH_BRIDGE, DEV_PATH_PPPOE, DEV_PATH_DSA, + DEV_PATH_MTK_WDMA, }; struct net_device_path { @@ -887,6 +888,12 @@ struct net_device_path { int port; u16 proto; } dsa; + struct { + u8 wdma_idx; + u8 queue; + u16 wcid; + u8 bss; + } mtk_wdma; }; }; diff --git a/net/core/dev.c b/net/core/dev.c index 2ec17358d7b4..d5a362d53b34 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -701,6 +701,10 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, if (WARN_ON_ONCE(last_dev == ctx.dev)) return -1; } + + if (!ctx.dev) + return ret; + path = dev_fwd_path(stack); if (!path) return -1; -- cgit v1.2.3-59-g8ed1b From 2cc6cdd44a1655ac5a9863529a2fd6dbed2d092c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Apr 2022 14:37:53 -0700 Subject: net: unexport a handful of dev_* functions We have a bunch of functions which are only used under net/core/ yet they get exported. Remove the exports. Signed-off-by: Jakub Kicinski --- net/core/dev.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index d5a362d53b34..8755ad71be6c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8645,7 +8645,6 @@ void dev_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } -EXPORT_SYMBOL(dev_set_group); /** * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. @@ -8760,7 +8759,6 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier) return -ENODEV; return ops->ndo_change_carrier(dev, new_carrier); } -EXPORT_SYMBOL(dev_change_carrier); /** * dev_get_phys_port_id - Get device physical port ID @@ -8778,7 +8776,6 @@ int dev_get_phys_port_id(struct net_device *dev, return -EOPNOTSUPP; return ops->ndo_get_phys_port_id(dev, ppid); } -EXPORT_SYMBOL(dev_get_phys_port_id); /** * dev_get_phys_port_name - Get device physical port name @@ -8801,7 +8798,6 @@ int dev_get_phys_port_name(struct net_device *dev, } return devlink_compat_phys_port_name_get(dev, name, len); } -EXPORT_SYMBOL(dev_get_phys_port_name); /** * dev_get_port_parent_id - Get the device's port parent identifier @@ -8883,7 +8879,6 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) dev->proto_down = proto_down; return 0; } -EXPORT_SYMBOL(dev_change_proto_down); /** * dev_change_proto_down_reason - proto down reason @@ -8908,7 +8903,6 @@ void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, } } } -EXPORT_SYMBOL(dev_change_proto_down_reason); struct bpf_xdp_link { struct bpf_link link; -- cgit v1.2.3-59-g8ed1b From 6264f58ca0e54e41d63c2d00334a48bac28fbf30 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Apr 2022 14:37:54 -0700 Subject: net: extract a few internals from netdevice.h There's a number of functions and static variables used under net/core/ but not from the outside. We currently dump most of them into netdevice.h. That's bad for many reasons: - netdevice.h is very cluttered, hard to figure out what the APIs are; - netdevice.h is very long; - we have to touch netdevice.h more, which causes expensive incremental builds. Create a header under net/core/ and move some declarations. The new header is also a bit of a catch-all, but that's fine; if we create more specific headers, people will likely over-think where their declarations fit best, and end up putting them in netdevice.h again. More work should be done on splitting netdevice.h into more targeted headers, but that'd be more time consuming, so: small steps. 
Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 72 +----------------------------------- net/core/dev.c | 1 + net/core/dev.h | 91 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/dev_addr_lists.c | 2 + net/core/dev_ioctl.c | 2 + net/core/link_watch.c | 1 + net/core/net-procfs.c | 2 + net/core/net-sysfs.c | 1 + net/core/rtnetlink.c | 2 + net/core/sock.c | 2 + net/core/sysctl_net_core.c | 2 + 11 files changed, 108 insertions(+), 70 deletions(-) create mode 100644 net/core/dev.h (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7b2a0b739684..7e7b2a72e473 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -59,7 +59,8 @@ struct dsa_port; struct ip_tunnel_parm; struct macsec_context; struct macsec_ops; - +struct netdev_name_node; +struct sd_flow_limit; struct sfp_bus; /* 802.11 specific */ struct wireless_dev; @@ -1020,16 +1021,6 @@ struct dev_ifalias { struct devlink; struct tlsdev_ops; -struct netdev_name_node { - struct hlist_node hlist; - struct list_head list; - struct net_device *dev; - const char *name; -}; - -int netdev_name_node_alt_create(struct net_device *dev, const char *name); -int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); - struct netdev_net_notifier { struct list_head list; struct notifier_block *nb; @@ -2975,7 +2966,6 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); -int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); @@ -3034,19 +3024,6 @@ static inline bool dev_has_header(const struct net_device *dev) return dev->header_ops && dev->header_ops->create; } -#ifdef CONFIG_NET_FLOW_LIMIT -#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ -struct sd_flow_limit { - u64 count; - unsigned int num_buckets; - unsigned int history_head; - u16 history[FLOW_LIMIT_HISTORY]; - u8 buckets[]; -}; - -extern int netdev_flow_limit_table_len; -#endif /* CONFIG_NET_FLOW_LIMIT */ - /* * Incoming packets are placed on per-CPU queues */ @@ -3770,7 +3747,6 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); void __dev_notify_flags(struct net_device *, unsigned int old_flags, unsigned int gchanges); -int dev_change_name(struct net_device *, const char *); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, @@ -3782,13 +3758,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, return __dev_change_net_namespace(dev, net, pat, 0); } int __dev_set_mtu(struct net_device *, int); -int dev_validate_mtu(struct net_device *dev, int mtu, - struct netlink_ext_ack *extack); -int dev_set_mtu_ext(struct net_device *dev, int mtu, - struct netlink_ext_ack *extack); int dev_set_mtu(struct net_device *, int); -int dev_change_tx_queue_len(struct net_device *, unsigned long); -void dev_set_group(struct net_device *, int); int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, @@ -3796,24 +3766,13 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, int 
dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); -int dev_change_carrier(struct net_device *, bool new_carrier); -int dev_get_phys_port_id(struct net_device *dev, - struct netdev_phys_item_id *ppid); -int dev_get_phys_port_name(struct net_device *dev, - char *name, size_t len); int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); -int dev_change_proto_down(struct net_device *dev, bool proto_down); -void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, - u32 value); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); -typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); -int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, int expected_fd, u32 flags); int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u8 dev_xdp_prog_count(struct net_device *dev); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); @@ -3898,13 +3857,6 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev, bool dev_nit_active(struct net_device *dev); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); -extern int netdev_budget; -extern unsigned int netdev_budget_usecs; - -/* Used by rtnetlink.c:__rtnl_unlock()/rtnl_unlock() */ -extern struct list_head net_todo_list; -void netdev_run_todo(void); - static inline void __dev_put(struct net_device *dev) { if (dev) { @@ -4021,10 +3973,7 @@ static inline void dev_replace_track(struct net_device *odev, * called netif_lowerlayer_*() because they represent the state of any * kind of lower layer not just hardware media. 
*/ - -void linkwatch_init_dev(struct net_device *dev); void linkwatch_fire_event(struct net_device *dev); -void linkwatch_forget_dev(struct net_device *dev); /** * netif_carrier_ok - test if carrier present @@ -4470,9 +4419,6 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); -void dev_addr_flush(struct net_device *dev); -int dev_addr_init(struct net_device *dev); -void dev_addr_check(struct net_device *dev); /* Functions used for unicast addresses handling */ int dev_uc_add(struct net_device *dev, const unsigned char *addr); @@ -4562,7 +4508,6 @@ static inline void __dev_mc_unsync(struct net_device *dev, /* Functions used for secondary unicast and multicast support */ void dev_set_rx_mode(struct net_device *dev); -void __dev_set_rx_mode(struct net_device *dev); int dev_set_promiscuity(struct net_device *dev, int inc); int dev_set_allmulti(struct net_device *dev, int inc); void netdev_state_change(struct net_device *dev); @@ -4580,11 +4525,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int netdev_max_backlog; -extern int netdev_tstamp_prequeue; -extern int netdev_unregister_timeout_secs; -extern int weight_p; -extern int dev_weight_rx_bias; -extern int dev_weight_tx_bias; extern int dev_rx_weight; extern int dev_tx_weight; extern int gro_normal_batch; @@ -4772,12 +4712,6 @@ static inline void netdev_rx_csum_fault(struct net_device *dev, void net_enable_timestamp(void); void net_disable_timestamp(void); -#ifdef CONFIG_PROC_FS -int __init dev_proc_init(void); -#else -#define dev_proc_init() 0 -#endif - static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) @@ -4813,8 +4747,6 @@ extern const struct kobj_ns_type_operations net_ns_type_operations; const char *netdev_drivername(const struct net_device *dev); -void linkwatch_run_queue(void); - static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { diff --git a/net/core/dev.c b/net/core/dev.c index 8755ad71be6c..f00d29856b43 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -151,6 +151,7 @@ #include #include +#include "dev.h" #include "net-sysfs.h" diff --git a/net/core/dev.h b/net/core/dev.h new file mode 100644 index 000000000000..27923df00637 --- /dev/null +++ b/net/core/dev.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_CORE_DEV_H +#define _NET_CORE_DEV_H + +#include + +struct net; +struct net_device; +struct netdev_bpf; +struct netdev_phys_item_id; +struct netlink_ext_ack; + +/* Random bits of netdevice that don't need to be exposed */ +#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ +struct sd_flow_limit { + u64 count; + unsigned int num_buckets; + unsigned int history_head; + u16 history[FLOW_LIMIT_HISTORY]; + u8 buckets[]; +}; + +extern int netdev_flow_limit_table_len; + +#ifdef CONFIG_PROC_FS +int __init dev_proc_init(void); +#else +#define dev_proc_init() 0 +#endif + +void linkwatch_init_dev(struct net_device *dev); +void linkwatch_forget_dev(struct net_device *dev); +void linkwatch_run_queue(void); + +void dev_addr_flush(struct net_device *dev); +int dev_addr_init(struct net_device *dev); +void dev_addr_check(struct net_device *dev); + +/* sysctls not referred to from outside net/core/ */ +extern int 
netdev_budget; +extern unsigned int netdev_budget_usecs; + +extern int netdev_tstamp_prequeue; +extern int netdev_unregister_timeout_secs; +extern int weight_p; +extern int dev_weight_rx_bias; +extern int dev_weight_tx_bias; + +/* rtnl helpers */ +extern struct list_head net_todo_list; +void netdev_run_todo(void); + +/* netdev management, shared between various uAPI entry points */ +struct netdev_name_node { + struct hlist_node hlist; + struct list_head list; + struct net_device *dev; + const char *name; +}; + +int netdev_get_name(struct net *net, char *name, int ifindex); +int dev_change_name(struct net_device *dev, const char *newname); + +int netdev_name_node_alt_create(struct net_device *dev, const char *name); +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); + +int dev_validate_mtu(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); +int dev_set_mtu_ext(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); + +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_item_id *ppid); +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len); + +int dev_change_proto_down(struct net_device *dev, bool proto_down); +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value); + +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, int expected_fd, u32 flags); + +int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len); +void dev_set_group(struct net_device *dev, int new_group); +int dev_change_carrier(struct net_device *dev, bool new_carrier); + +void __dev_set_rx_mode(struct net_device *dev); + +#endif diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index bead38ca50bd..baa63dee2829 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -12,6 +12,8 @@ #include #include +#include "dev.h" + /* * General list handling functions */ diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 1b807d119da5..4f6be442ae7e 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -10,6 +10,8 @@ #include #include +#include "dev.h" + /* * Map an interface index to its name (SIOCGIFNAME) */ diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 95098d1a49bd..a244d3bade7d 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -18,6 +18,7 @@ #include #include +#include "dev.h" enum lw_bits { LW_URGENT = 0, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 88cc0ad7d386..1ec23bf8b05c 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -4,6 +4,8 @@ #include #include +#include "dev.h" + #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) #define get_bucket(x) ((x) >> BUCKET_SPACE) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 9cbc1c8289bc..4980c3a50475 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -24,6 +24,7 @@ #include #include +#include "dev.h" #include "net-sysfs.h" #ifdef CONFIG_SYSFS diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0e4502d641eb..4041b3e2e8ec 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -54,6 +54,8 @@ #include #include +#include "dev.h" + #define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 40 diff --git a/net/core/sock.c b/net/core/sock.c index 1180a0cb0110..7000403eaeb2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -141,6 +141,8 @@ #include +#include "dev.h" + static 
DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7123fe7feeac..8295e5877eb3 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -23,6 +23,8 @@ #include #include +#include "dev.h" + static int two = 2; static int three = 3; static int int_3600 = 3600; -- cgit v1.2.3-59-g8ed1b From 794c24e9921f32ded4422833a990ccf11dc3c00e Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 6 Apr 2022 17:26:00 +0000 Subject: net-core: rx_otherhost_dropped to core_stats Increment the rx_otherhost_dropped counter when a packet is dropped due to a mismatched destination MAC address. An example of when this drop can occur is when manually crafting raw packets that will be consumed by a user space application via a tap device. For testing purposes, local traffic was generated using trafgen for the client and netcat to start a server. Tested: Created 2 netns, sent 1 packet using trafgen from one to the other with "{eth(daddr=$INCORRECT_MAC...}", verified that iproute2 showed the counter was incremented. (Also had to modify iproute2 to show the stat, additional patch for that coming next.) Signed-off-by: Jeffrey Ji Reviewed-by: Brian Vazquez Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220406172600.1141083-1-jeffreyjilinux@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ include/uapi/linux/if_link.h | 5 +++++ net/core/dev.c | 1 + net/ipv4/ip_input.c | 1 + net/ipv6/ip6_input.c | 1 + 5 files changed, 10 insertions(+) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7e7b2a72e473..28ea4f8269d4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -203,6 +203,7 @@ struct net_device_core_stats { local_t rx_dropped; local_t tx_dropped; local_t rx_nohandler; + local_t rx_otherhost_dropped; } __aligned(4 * sizeof(local_t)); #include @@ -3837,6 +3838,7 @@ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) DEV_CORE_STATS_INC(rx_nohandler) +DEV_CORE_STATS_INC(rx_otherhost_dropped) static __always_inline int ____dev_forward_skb(struct net_device *dev, struct sk_buff *skb, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cc284c048e69..d1e600816b82 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -211,6 +211,9 @@ struct rtnl_link_stats { * @rx_nohandler: Number of packets received on the interface * but dropped by the networking stack because the device is * not designated to receive packets (e.g. backup link in a bond). + * + * @rx_otherhost_dropped: Number of packets dropped due to mismatch + * in destination MAC address. */ struct rtnl_link_stats64 { __u64 rx_packets; @@ -243,6 +246,8 @@ struct rtnl_link_stats64 { __u64 rx_compressed; __u64 tx_compressed; __u64 rx_nohandler; + + __u64 rx_otherhost_dropped; }; /* Subset of link stats useful for in-HW collection. 
Meaning of the fields is as diff --git a/net/core/dev.c b/net/core/dev.c index f00d29856b43..e027410e861b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10358,6 +10358,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, storage->rx_dropped += local_read(&core_stats->rx_dropped); storage->tx_dropped += local_read(&core_stats->tx_dropped); storage->rx_nohandler += local_read(&core_stats->rx_nohandler); + storage->rx_otherhost_dropped += local_read(&core_stats->rx_otherhost_dropped); } } return storage; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 95f7bb052784..b1165f717cd1 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -451,6 +451,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) { + dev_core_stats_rx_otherhost_dropped_inc(skb->dev); drop_reason = SKB_DROP_REASON_OTHERHOST; goto drop; } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 5b5ea35635f9..b4880c7c84eb 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -150,6 +150,7 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, struct inet6_dev *idev; if (skb->pkt_type == PACKET_OTHERHOST) { + dev_core_stats_rx_otherhost_dropped_inc(skb->dev); kfree_skb(skb); return NULL; } -- cgit v1.2.3-59-g8ed1b From 9f8ed577c28813410614b418bad42285840c1a00 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Apr 2022 14:20:50 +0800 Subject: net: skb: rename SKB_DROP_REASON_PTYPE_ABSENT As David Ahern suggested, the reasons for skb drops should be more general and not be code based. Therefore, rename SKB_DROP_REASON_PTYPE_ABSENT to SKB_DROP_REASON_UNHANDLED_PROTO, which is used for the cases of no L3 protocol handler, no L4 protocol handler, version extensions, etc. Following the previous discussion, the aim now is to make these reasons more abstract and user-based, avoiding code-based naming. Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 +++----- include/trace/events/skb.h | 2 +- net/core/dev.c | 8 +++----- 3 files changed, 7 insertions(+), 11 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2394441fa3dd..173bc35a10a3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -408,11 +408,9 @@ enum skb_drop_reason { */ SKB_DROP_REASON_XDP, /* dropped by XDP in input path */ SKB_DROP_REASON_TC_INGRESS, /* dropped in TC ingress HOOK */ - SKB_DROP_REASON_PTYPE_ABSENT, /* not packet_type found to handle - * the skb. 
For an etner packet, - * this means that L3 protocol is - * not supported - */ + SKB_DROP_REASON_UNHANDLED_PROTO, /* protocol not implemented + * or not supported + */ SKB_DROP_REASON_SKB_CSUM, /* sk_buff checksum computation * error */ diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index e1670e1e4934..85abd7cbd221 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -50,7 +50,7 @@ EM(SKB_DROP_REASON_CPU_BACKLOG, CPU_BACKLOG) \ EM(SKB_DROP_REASON_XDP, XDP) \ EM(SKB_DROP_REASON_TC_INGRESS, TC_INGRESS) \ - EM(SKB_DROP_REASON_PTYPE_ABSENT, PTYPE_ABSENT) \ + EM(SKB_DROP_REASON_UNHANDLED_PROTO, UNHANDLED_PROTO) \ EM(SKB_DROP_REASON_SKB_CSUM, SKB_CSUM) \ EM(SKB_DROP_REASON_SKB_GSO_SEG, SKB_GSO_SEG) \ EM(SKB_DROP_REASON_SKB_UCOPY_FAULT, SKB_UCOPY_FAULT) \ diff --git a/net/core/dev.c b/net/core/dev.c index e027410e861b..ba853e878007 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5375,13 +5375,11 @@ check_vlan_id: *ppt_prev = pt_prev; } else { drop: - if (!deliver_exact) { + if (!deliver_exact) dev_core_stats_rx_dropped_inc(skb->dev); - kfree_skb_reason(skb, SKB_DROP_REASON_PTYPE_ABSENT); - } else { + else dev_core_stats_rx_nohandler_inc(skb->dev); + kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ -- cgit v1.2.3-59-g8ed1b From 2f1e85b1aee459b7d0fd981839042c6a38ffaf0c Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sat, 16 Apr 2022 00:40:45 +0800 Subject: net: sched: use queue_mapping to pick tx queue This patch fixes an issue: * If we install tc filters with act_skbedit in the clsact hook, they do not work, because netdev_core_pick_tx() overwrites queue_mapping. $ tc filter ... action skbedit queue_mapping 1 And this patch is useful: * We can use FQ + EDT to implement efficient policies. Tx queues are picked by xps, by the netdev driver's ndo_select_queue, or by the skb hash in netdev_core_pick_tx(). In fact, the netdev driver and the skb hash are _not_ under our control. xps uses the CPU map to select Tx queues, but in most cases we can't figure out which pod/container task_struct is running on a given cpu. We can use clsact filters to classify one pod/container's traffic to one Tx queue. Why? In a container networking environment, there are two kinds of pod/container/net-namespace. For one kind (e.g. P1, P2), high throughput is key in these applications. But to avoid running out of network resources, the outbound traffic of these pods is limited, using or sharing dedicated Tx queues assigned an HTB/TBF/FQ Qdisc. For the other kind of pods (e.g. Pn), low latency of data access is key, and the traffic is not limited. These pods use or share other dedicated Tx queues assigned a FIFO Qdisc. This choice provides two benefits. First, contention on the HTB/FQ Qdisc lock is significantly reduced since fewer CPUs contend for the same queue. More importantly, Qdisc contention can be eliminated completely if each CPU has its own FIFO Qdisc for the second kind of pods. There must be a mechanism in place to support classifying traffic based on pods/containers to different Tx queues. Note that clsact sits outside of the Qdisc, while a Qdisc can run a classifier to select a sub-queue under its lock. In general, recording the decision in the skb seems a little heavy handed. This patch instead introduces a per-CPU variable, suggested by Eric. The xmit.skip_txqueue flag is first cleared in __dev_queue_xmit(). 
- A Tx Qdisc may run installed skbedit actions; the xmit.skip_txqueue flag is then set in qdisc->enqueue() even though the tx queue has already been selected in netdev_tx_queue_mapping() or netdev_core_pick_tx(). Clearing that flag first in __dev_queue_xmit() is useful: - It avoids picking a Tx queue with netdev_tx_queue_mapping() in the next netdev, in a chain such as: eth0 macvlan - eth0.3 vlan - eth0 ixgbe-phy. For example, eth0, a macvlan in a pod whose root Qdisc installs skbedit queue_mapping, sends packets to eth0.3, a vlan in the host. __dev_queue_xmit() of eth0.3 clears the flag and does not select the tx queue according to skb->queue_mapping, because there are no filters in the clsact or tx Qdisc of this netdev. The same action is taken in eth0, the ixgbe device in the host. - It avoids picking a Tx queue for the next packet. If we set xmit.skip_txqueue in a tx Qdisc (qdisc->enqueue()), the proper way to clear it is to clear it in __dev_queue_xmit() when processing the next packets. For performance reasons, use the static key. If the user does not configure NET_EGRESS, this code is not compiled in. +----+ +----+ +----+ | P1 | | P2 | | Pn | +----+ +----+ +----+ | | | +-----------+-----------+ | | clsact/skbedit | MQ v +-----------+-----------+ | q0 | q1 | qn v v v HTB/FQ HTB/FQ ... FIFO Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Jonathan Lemon Cc: Eric Dumazet Cc: Alexander Lobakin Cc: Paolo Abeni Cc: Talal Ahmad Cc: Kevin Hao Cc: Ilias Apalodimas Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Antoine Tenart Cc: Wei Wang Cc: Arnd Bergmann Suggested-by: Eric Dumazet Signed-off-by: Tonghao Zhang Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 3 +++ include/linux/rtnetlink.h | 1 + net/core/dev.c | 31 +++++++++++++++++++++++++++++-- net/sched/act_skbedit.c | 6 +++++- 4 files changed, 38 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a602f29365b0..7dccbfd1bf56 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3061,6 +3061,9 @@ struct softnet_data { struct { u16 recursion; u8 more; +#ifdef CONFIG_NET_EGRESS + u8 skip_txqueue; +#endif } xmit; #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 7f970b16da3a..ae2c6a3cec5d 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -100,6 +100,7 @@ void net_dec_ingress_queue(void); #ifdef CONFIG_NET_EGRESS void net_inc_egress_queue(void); void net_dec_egress_queue(void); +void netdev_xmit_skip_txqueue(bool skip); #endif void rtnetlink_init(void); diff --git a/net/core/dev.c b/net/core/dev.c index ba853e878007..4a77ebda4fb1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3925,6 +3925,25 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return skb; } + +static struct netdev_queue * +netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) +{ + int qm = skb_get_queue_mapping(skb); + + return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); +} + +static bool netdev_xmit_txqueue_skipped(void) +{ + return __this_cpu_read(softnet_data.xmit.skip_txqueue); +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_XPS @@ -4095,7 +4114,7 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; - struct netdev_queue *txq; + struct netdev_queue *txq = NULL; struct Qdisc *q; int rc = -ENOMEM; bool again = false; @@ -4123,11 +4142,17 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (!skb) goto out; } + + netdev_xmit_skip_txqueue(false); + nf_skip_egress(skb, true); skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; nf_skip_egress(skb, false); + + if (netdev_xmit_txqueue_skipped()) + txq = netdev_tx_queue_mapping(dev, skb); } #endif /* If device/qdisc don't need skb->dst, release it right now while @@ -4138,7 +4163,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) else skb_dst_force(skb); - txq = netdev_core_pick_tx(dev, skb, sb_dev); + if (!txq) + txq = netdev_core_pick_tx(dev, skb, sb_dev); + q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 92d0dc754207..1c5fdb6e7c2f 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -58,8 +58,12 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, } } if (params->flags & SKBEDIT_F_QUEUE_MAPPING && - skb->dev->real_num_tx_queues > params->queue_mapping) + skb->dev->real_num_tx_queues > params->queue_mapping) { +#ifdef CONFIG_NET_EGRESS + netdev_xmit_skip_txqueue(true); +#endif skb_set_queue_mapping(skb, params->queue_mapping); + } if (params->flags & SKBEDIT_F_MARK) { skb->mark &= ~params->mask; skb->mark |= params->mark & params->mask; -- cgit v1.2.3-59-g8ed1b
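
As a usage illustration of the queue_mapping feature above (the device name, the IP match and the queue number are assumptions, not taken from the patch), a clsact setup that steers one pod's egress traffic to a dedicated Tx queue could look like:

    # attach a clsact qdisc, then steer one container's egress
    # traffic to Tx queue 1, which a dedicated Qdisc serves
    tc qdisc add dev eth0 clsact
    tc filter add dev eth0 egress protocol ip flower src_ip 10.0.0.2 \
            action skbedit queue_mapping 1

With this patch applied, the queue selected by the filter survives into __dev_queue_xmit() instead of being overwritten by netdev_core_pick_tx().
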
From 68822bdf76f10c3dc80609d4e2cdc1e847429086 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Apr 2022 13:12:37 -0700 Subject: net: generalize skb freeing deferral to per-cpu lists Logic added in commit f35f821935d8 ("tcp: defer skb freeing after socket lock is released") helped bulk TCP flows to move the cost of skb frees outside of the critical section where the socket lock was held. But for RPC traffic, or hosts with RFS enabled, the solution is far from being ideal. For RPC traffic, recvmsg() has to return to user space right after the skb payload has been consumed, meaning that the BH handler has no chance to pick the skb before the recvmsg() thread does. This issue is more visible with BIG TCP, as more RPCs fit in one skb. For RFS, even if the BH handler picks the skbs, they are still picked on the cpu on which the user thread is running. Ideally, it is better to free the skbs (and associated page frags) on the cpu that originally allocated them. This patch removes the per-socket anchor (sk->defer_list) and instead uses a per-cpu list, which will hold more skbs per round. This new per-cpu list is drained at the end of net_rx_action(), after incoming packets have been processed, to lower latencies. In normal conditions, skbs are added to the per-cpu list with no further action. In the (unlikely) cases where the cpu does not run the net_rx_action() handler fast enough, we use an IPI to raise NET_RX_SOFTIRQ on the remote cpu. Also, we do not bother draining the per-cpu list from dev_cpu_dead(). This is because skbs in this list have no requirement on how fast they should be freed. Note that we can add in the future a small per-cpu cache if we see any contention on sd->defer_lock. 
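
To make the pattern concrete, here is a minimal userspace sketch of the same "free on the allocating cpu" idea (pthread mutexes stand in for sd->defer_lock, a fixed thread count stands in for CPUs, and there is no IPI equivalent; all names are illustrative, this is not the kernel code):

    #include <pthread.h>
    #include <stdlib.h>

    #define NCPU 4

    struct buf {
            struct buf *next;
            int alloc_cpu;                  /* like skb->alloc_cpu */
    };

    static struct {
            pthread_mutex_t lock;           /* like sd->defer_lock */
            struct buf *head;               /* like sd->defer_list */
            int count;                      /* like sd->defer_count */
    } defer[NCPU];

    static void defer_init(void)
    {
            for (int i = 0; i < NCPU; i++)
                    pthread_mutex_init(&defer[i].lock, NULL);
    }

    /* like skb_attempt_defer_free(): hand the buffer back to its
     * allocating cpu instead of freeing it (and its pages) here */
    static void attempt_defer_free(struct buf *b, int this_cpu)
    {
            int cpu = b->alloc_cpu;

            if (cpu == this_cpu) {          /* local buffer: free now */
                    free(b);
                    return;
            }
            pthread_mutex_lock(&defer[cpu].lock);
            b->next = defer[cpu].head;
            defer[cpu].head = b;
            defer[cpu].count++;             /* the kernel kicks an IPI at 128 */
            pthread_mutex_unlock(&defer[cpu].lock);
    }

    /* like skb_defer_free_flush(): each cpu periodically drains its
     * own list, so frees happen where the buffers were allocated */
    static void defer_free_flush(int this_cpu)
    {
            struct buf *list;

            pthread_mutex_lock(&defer[this_cpu].lock);
            list = defer[this_cpu].head;
            defer[this_cpu].head = NULL;
            defer[this_cpu].count = 0;
            pthread_mutex_unlock(&defer[this_cpu].lock);

            while (list) {
                    struct buf *next = list->next;

                    free(list);
                    list = next;
            }
    }
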
Tested on a pair of hosts with 100Gbit NIC, RFS enabled, and /proc/sys/net/ipv4/tcp_rmem[2] tuned to 16MB to work around page recycling strategy used by NIC driver (its page pool capacity being too small compared to number of skbs/pages held in sockets receive queues) Note that this tuning was only done to demonstrate worse conditions for skb freeing for this particular test. These conditions can happen in more general production workload. 10 runs of one TCP_STREAM flow Before: Average throughput: 49685 Mbit. Kernel profiles on cpu running user thread recvmsg() show high cost for skb freeing related functions (*) 57.81% [kernel] [k] copy_user_enhanced_fast_string (*) 12.87% [kernel] [k] skb_release_data (*) 4.25% [kernel] [k] __free_one_page (*) 3.57% [kernel] [k] __list_del_entry_valid 1.85% [kernel] [k] __netif_receive_skb_core 1.60% [kernel] [k] __skb_datagram_iter (*) 1.59% [kernel] [k] free_unref_page_commit (*) 1.16% [kernel] [k] __slab_free 1.16% [kernel] [k] _copy_to_iter (*) 1.01% [kernel] [k] kfree (*) 0.88% [kernel] [k] free_unref_page 0.57% [kernel] [k] ip6_rcv_core 0.55% [kernel] [k] ip6t_do_table 0.54% [kernel] [k] flush_smp_call_function_queue (*) 0.54% [kernel] [k] free_pcppages_bulk 0.51% [kernel] [k] llist_reverse_order 0.38% [kernel] [k] process_backlog (*) 0.38% [kernel] [k] free_pcp_prepare 0.37% [kernel] [k] tcp_recvmsg_locked (*) 0.37% [kernel] [k] __list_add_valid 0.34% [kernel] [k] sock_rfree 0.34% [kernel] [k] _raw_spin_lock_irq (*) 0.33% [kernel] [k] __page_cache_release 0.33% [kernel] [k] tcp_v6_rcv (*) 0.33% [kernel] [k] __put_page (*) 0.29% [kernel] [k] __mod_zone_page_state 0.27% [kernel] [k] _raw_spin_lock After patch: Average throughput: 73076 Mbit. Kernel profiles on cpu running user thread recvmsg() looks better: 81.35% [kernel] [k] copy_user_enhanced_fast_string 1.95% [kernel] [k] _copy_to_iter 1.95% [kernel] [k] __skb_datagram_iter 1.27% [kernel] [k] __netif_receive_skb_core 1.03% [kernel] [k] ip6t_do_table 0.60% [kernel] [k] sock_rfree 0.50% [kernel] [k] tcp_v6_rcv 0.47% [kernel] [k] ip6_rcv_core 0.45% [kernel] [k] read_tsc 0.44% [kernel] [k] _raw_spin_lock_irqsave 0.37% [kernel] [k] _raw_spin_lock 0.37% [kernel] [k] native_irq_return_iret 0.33% [kernel] [k] __inet6_lookup_established 0.31% [kernel] [k] ip6_protocol_deliver_rcu 0.29% [kernel] [k] tcp_rcv_established 0.29% [kernel] [k] llist_reverse_order v2: kdoc issue (kernel bots) do not defer if (alloc_cpu == smp_processor_id()) (Paolo) replace the sk_buff_head with a single-linked list (Jakub) add a READ_ONCE()/WRITE_ONCE() for the lockless read of sd->defer_list Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220422201237.416238-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 +++++ include/linux/skbuff.h | 3 +++ include/net/sock.h | 2 -- include/net/tcp.h | 12 ----------- net/core/dev.c | 31 ++++++++++++++++++++++++++++ net/core/skbuff.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++- net/core/sock.c | 3 --- net/ipv4/tcp.c | 25 +---------------------- net/ipv4/tcp_ipv4.c | 1 - net/ipv6/tcp_ipv6.c | 1 - net/tls/tls_sw.c | 2 -- 11 files changed, 90 insertions(+), 46 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7dccbfd1bf56..ac8a5f71220a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3081,6 +3081,11 @@ struct softnet_data { struct sk_buff_head input_pkt_queue; struct napi_struct backlog; + /* Another possibly contended 
cache line */ + spinlock_t defer_lock ____cacheline_aligned_in_smp; + int defer_count; + struct sk_buff *defer_list; + call_single_data_t defer_csd; }; static inline void input_queue_head_incr(struct softnet_data *sd) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 84d78df60453..5cbc184ca685 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -888,6 +888,7 @@ typedef unsigned char *sk_buff_data_t; * delivery_time at egress. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS + * @alloc_cpu: CPU which did the skb allocation. * @secmark: security marking * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available @@ -1080,6 +1081,7 @@ struct sk_buff { unsigned int sender_cpu; }; #endif + u16 alloc_cpu; #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; #endif @@ -1321,6 +1323,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size); +void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); diff --git a/include/net/sock.h b/include/net/sock.h index a01d6c421aa2..f9f8ecae0f8d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -292,7 +292,6 @@ struct sk_filter; * @sk_pacing_shift: scaling factor for TCP Small Queues * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held - * @defer_list: head of llist storing skbs to be freed * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, @@ -417,7 +416,6 @@ struct sock { struct sk_buff *head; struct sk_buff *tail; } sk_backlog; - struct llist_head defer_list; #define sk_rmem_alloc sk_backlog.rmem_alloc diff --git a/include/net/tcp.h b/include/net/tcp.h index 679b1964d494..94a52ad1101c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1375,18 +1375,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); -#ifdef CONFIG_INET -void __sk_defer_free_flush(struct sock *sk); - -static inline void sk_defer_free_flush(struct sock *sk) -{ - if (llist_empty(&sk->defer_list)) - return; - __sk_defer_free_flush(sk); -} -#else -static inline void sk_defer_free_flush(struct sock *sk) {} -#endif int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); diff --git a/net/core/dev.c b/net/core/dev.c index 4a77ebda4fb1..611bd7197064 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4545,6 +4545,12 @@ static void rps_trigger_softirq(void *data) #endif /* CONFIG_RPS */ +/* Called from hardirq (IPI) context */ +static void trigger_rx_softirq(void *data __always_unused) +{ + __raise_softirq_irqoff(NET_RX_SOFTIRQ); +} + /* * Check if this softnet_data structure is another cpu one * If yes, queue it to our IPI list and return 1 @@ -6571,6 +6577,28 @@ static int napi_threaded_poll(void *data) return 0; } +static void skb_defer_free_flush(struct softnet_data *sd) +{ + struct sk_buff *skb, *next; + unsigned long flags; + + /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ + if (!READ_ONCE(sd->defer_list)) + return; + + spin_lock_irqsave(&sd->defer_lock, flags); + skb = sd->defer_list; + 
sd->defer_list = NULL; + sd->defer_count = 0; + spin_unlock_irqrestore(&sd->defer_lock, flags); + + while (skb != NULL) { + next = skb->next; + __kfree_skb(skb); + skb = next; + } +} + static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -6616,6 +6644,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); + skb_defer_free_flush(sd); } struct netdev_adjacent { @@ -11326,6 +11355,8 @@ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif + INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL); + spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 30b523fa4ad2..028a280fbabd 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -204,7 +204,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data, skb_set_end_offset(skb, size); skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; - + skb->alloc_cpu = raw_smp_processor_id(); /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); @@ -1037,6 +1037,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #ifdef CONFIG_NET_RX_BUSY_POLL CHECK_SKB_FIELD(napi_id); #endif + CHECK_SKB_FIELD(alloc_cpu); #ifdef CONFIG_XPS CHECK_SKB_FIELD(sender_cpu); #endif @@ -6486,3 +6487,51 @@ free_now: } EXPORT_SYMBOL(__skb_ext_put); #endif /* CONFIG_SKB_EXTENSIONS */ + +/** + * skb_attempt_defer_free - queue skb for remote freeing + * @skb: buffer + * + * Put @skb in a per-cpu list, using the cpu which + * allocated the skb/pages to reduce false sharing + * and memory zone spinlock contention. + */ +void skb_attempt_defer_free(struct sk_buff *skb) +{ + int cpu = skb->alloc_cpu; + struct softnet_data *sd; + unsigned long flags; + bool kick; + + if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || + !cpu_online(cpu) || + cpu == raw_smp_processor_id()) { + __kfree_skb(skb); + return; + } + + sd = &per_cpu(softnet_data, cpu); + /* We do not send an IPI or any signal. + * Remote cpu will eventually call skb_defer_free_flush() + */ + spin_lock_irqsave(&sd->defer_lock, flags); + skb->next = sd->defer_list; + /* Paired with READ_ONCE() in skb_defer_free_flush() */ + WRITE_ONCE(sd->defer_list, skb); + sd->defer_count++; + + /* kick every time queue length reaches 128. + * This should avoid blocking in smp_call_function_single_async(). + * This condition should hardly be bit under normal conditions, + * unless cpu suddenly stopped to receive NIC interrupts. + */ + kick = sd->defer_count == 128; + + spin_unlock_irqrestore(&sd->defer_lock, flags); + + /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU + * if we are unlucky enough (this seems very unlikely). 
+ */ + if (unlikely(kick)) + smp_call_function_single_async(cpu, &sd->defer_csd); +} diff --git a/net/core/sock.c b/net/core/sock.c index 29abec3eabd8..a0f3989de3d6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2082,9 +2082,6 @@ void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); - WARN_ON_ONCE(!llist_empty(&sk->defer_list)); - sk_defer_free_flush(sk); - if (rcu_access_pointer(sk->sk_reuseport_cb)) { reuseport_detach_sock(sk); use_call_rcu = true; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e20b87b3bf90..db55af9eb37b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -843,7 +843,6 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } release_sock(sk); - sk_defer_free_flush(sk); if (spliced) return spliced; @@ -1589,20 +1588,6 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } -void __sk_defer_free_flush(struct sock *sk) -{ - struct llist_node *head; - struct sk_buff *skb, *n; - - head = llist_del_all(&sk->defer_list); - llist_for_each_entry_safe(skb, n, head, ll_node) { - prefetch(n); - skb_mark_not_on_list(skb); - __kfree_skb(skb); - } -} -EXPORT_SYMBOL(__sk_defer_free_flush); - static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); @@ -1610,11 +1595,7 @@ static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) sock_rfree(skb); skb->destructor = NULL; skb->sk = NULL; - if (!skb_queue_empty(&sk->sk_receive_queue) || - !llist_empty(&sk->defer_list)) { - llist_add(&skb->ll_node, &sk->defer_list); - return; - } + return skb_attempt_defer_free(skb); } __kfree_skb(skb); } @@ -2453,7 +2434,6 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, __sk_flush_backlog(sk); } else { tcp_cleanup_rbuf(sk, copied); - sk_defer_free_flush(sk); sk_wait_data(sk, &timeo, last); } @@ -2571,7 +2551,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, lock_sock(sk); ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags); release_sock(sk); - sk_defer_free_flush(sk); if (cmsg_flags && ret >= 0) { if (cmsg_flags & TCP_CMSG_TS) @@ -3096,7 +3075,6 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; } - sk_defer_free_flush(sk); sk_error_report(sk); return 0; } @@ -4225,7 +4203,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level, err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); release_sock(sk); - sk_defer_free_flush(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; switch (len) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2c2d42142555..918816ec5dd4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2065,7 +2065,6 @@ process: sk_incoming_cpu_update(sk); - sk_defer_free_flush(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 54277de7474b..60bdec257ba7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1728,7 +1728,6 @@ process: sk_incoming_cpu_update(sk); - sk_defer_free_flush(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index ddbe05ec5489..bc54f6c5b1a4 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1911,7 +1911,6 @@ recv_end: end: release_sock(sk); - sk_defer_free_flush(sk); if (psock) sk_psock_put(sk, psock); return copied ? 
: err; @@ -1983,7 +1982,6 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, splice_read_end: release_sock(sk); - sk_defer_free_flush(sk); return copied ? : err; } -- cgit v1.2.3-59-g8ed1b From f3412b3879b4f7c4313b186b03940d4791345534 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Apr 2022 13:41:47 -0700 Subject: net: make sure net_rx_action() calls skb_defer_free_flush() I missed a stray return; in net_rx_action(), which very well is taken whenever trigger_rx_softirq() has been called on a cpu that is no longer receiving network packets, or receiving too few of them. Fixes: 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu lists") Signed-off-by: Eric Dumazet Reported-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://lore.kernel.org/r/20220427204147.1310161-1-eric.dumazet@gmail.com Signed-off-by: Paolo Abeni --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 611bd7197064..e09cd202fc57 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6617,7 +6617,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) if (list_empty(&list)) { if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) - return; + goto end; break; } @@ -6644,6 +6644,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); +end: skb_defer_free_flush(sd); } -- cgit v1.2.3-59-g8ed1b From c526fd8f9f4f21cb83c0b1c9a1ee9c0ac9be9e2e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 28 Apr 2022 11:58:46 +0100 Subject: net: inline dev_queue_xmit() Inline dev_queue_xmit() and dev_queue_xmit_accel(), they both are small proxy functions doing nothing but redirecting the control flow to __dev_queue_xmit(). Signed-off-by: Pavel Begunkov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 ++++++++++++-- net/core/dev.c | 15 ++------------- 2 files changed, 14 insertions(+), 15 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b75ca2d095ae..4aba92a4042a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2940,10 +2940,20 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); -int dev_queue_xmit(struct sk_buff *skb); -int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); +int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); +static inline int dev_queue_xmit(struct sk_buff *skb) +{ + return __dev_queue_xmit(skb, NULL); +} + +static inline int dev_queue_xmit_accel(struct sk_buff *skb, + struct net_device *sb_dev) +{ + return __dev_queue_xmit(skb, sb_dev); +} + static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { int ret; diff --git a/net/core/dev.c b/net/core/dev.c index 7a61690d3137..d127164771f2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4111,7 +4111,7 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, * the BH enable code must have IRQs enabled so that it will not deadlock. 
* --BLG */ -static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) +int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq = NULL; @@ -4235,18 +4235,7 @@ out: rcu_read_unlock_bh(); return rc; } - -int dev_queue_xmit(struct sk_buff *skb) -{ - return __dev_queue_xmit(skb, NULL); -} -EXPORT_SYMBOL(dev_queue_xmit); - -int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) -{ - return __dev_queue_xmit(skb, sb_dev); -} -EXPORT_SYMBOL(dev_queue_xmit_accel); +EXPORT_SYMBOL(__dev_queue_xmit); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { -- cgit v1.2.3-59-g8ed1b From 58caed3dacb4354a25a1aa8d2febc3e9648ba1f4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 May 2022 16:27:03 -0700 Subject: netdev: reshuffle netif_napi_add() APIs to allow dropping weight Most drivers should not have to worry about selecting the right weight for their NAPI instances and pass NAPI_POLL_WEIGHT. It'd be best if we didn't require the argument at all and selected the default internally. This change prepares the ground for such reshuffling, allowing for a smooth transition. The following API should remain after the next release cycle: netif_napi_add() netif_napi_add_weight() netif_napi_add_tx() netif_napi_add_tx_weight() Where the _weight() variants take an explicit weight argument. I opted for a _weight() suffix rather than a __ prefix, because we use __ in places to mean that caller needs to also issue a synchronize_net() call. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20220502232703.396351-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 50 +++++++++++++++++++++++++++++++---------------- net/core/dev.c | 6 +++--- 2 files changed, 36 insertions(+), 20 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4aba92a4042a..eaf66e57d891 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2499,37 +2499,53 @@ static inline void *netdev_priv(const struct net_device *dev) */ #define NAPI_POLL_WEIGHT 64 +void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight); + /** - * netif_napi_add - initialize a NAPI context - * @dev: network device - * @napi: NAPI context - * @poll: polling function - * @weight: default weight + * netif_napi_add() - initialize a NAPI context + * @dev: network device + * @napi: NAPI context + * @poll: polling function + * @weight: default weight * * netif_napi_add() must be used to initialize a NAPI context prior to calling * *any* of the other NAPI-related functions. 
*/ -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight); +static inline void +netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) +{ + netif_napi_add_weight(dev, napi, poll, weight); +} + +static inline void +netif_napi_add_tx_weight(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) +{ + set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); + netif_napi_add_weight(dev, napi, poll, weight); +} + +#define netif_tx_napi_add netif_napi_add_tx_weight /** - * netif_tx_napi_add - initialize a NAPI context - * @dev: network device - * @napi: NAPI context - * @poll: polling function - * @weight: default weight + * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only + * @dev: network device + * @napi: NAPI context + * @poll: polling function * * This variant of netif_napi_add() should be used from drivers using NAPI * to exclusively poll a TX queue. * This will avoid we add it into napi_hash[], thus polluting this hash table. */ -static inline void netif_tx_napi_add(struct net_device *dev, +static inline void netif_napi_add_tx(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), - int weight) + int (*poll)(struct napi_struct *, int)) { - set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); - netif_napi_add(dev, napi, poll, weight); + netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } /** diff --git a/net/core/dev.c b/net/core/dev.c index d127164771f2..c2d73595a7c3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6303,8 +6303,8 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } EXPORT_SYMBOL(dev_set_threaded); -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) +void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) { if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; @@ -6337,7 +6337,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, if (dev->threaded && napi_kthread_create(napi)) dev->threaded = 0; } -EXPORT_SYMBOL(netif_napi_add); +EXPORT_SYMBOL(netif_napi_add_weight); void napi_disable(struct napi_struct *n) { -- cgit v1.2.3-59-g8ed1b From 6df6398f7c8b481ce83f28143bc08a5231616deb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:31 -0700 Subject: net: add netif_inherit_tso_max() To make later patches smaller create a helper for inheriting the TSO limitations of a lower device. The TSO in the name is not an accident, subsequent patches will replace GSO with TSO in more names. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 3 +-- drivers/net/ipvlan/ipvlan_main.c | 6 ++---- drivers/net/macvlan.c | 6 ++---- drivers/net/veth.c | 3 +-- drivers/net/vxlan/vxlan_core.c | 3 +-- include/linux/netdevice.h | 3 +++ net/8021q/vlan.c | 3 +-- net/8021q/vlan_dev.c | 3 +-- net/core/dev.c | 12 ++++++++++++ 9 files changed, 24 insertions(+), 18 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index ba3fa7eac98d..790e1d5e4b4a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -286,8 +286,7 @@ nfp_repr_transfer_features(struct net_device *netdev, struct net_device *lower) if (repr->dst->u.port_info.lower_dev != lower) return; - netif_set_gso_max_size(netdev, lower->gso_max_size); - netif_set_gso_max_segs(netdev, lower->gso_max_segs); + netif_inherit_tso_max(netdev, lower); netdev_update_features(netdev); } diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 696e245f6d00..aa28a29e228c 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -139,8 +139,7 @@ static int ipvlan_init(struct net_device *dev) dev->vlan_features = phy_dev->vlan_features & IPVLAN_FEATURES; dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS; dev->hw_enc_features |= dev->features; - netif_set_gso_max_size(dev, phy_dev->gso_max_size); - netif_set_gso_max_segs(dev, phy_dev->gso_max_segs); + netif_inherit_tso_max(dev, phy_dev); dev->hard_header_len = phy_dev->hard_header_len; netdev_lockdep_set_classes(dev); @@ -762,8 +761,7 @@ static int ipvlan_device_event(struct notifier_block *unused, case NETDEV_FEAT_CHANGE: list_for_each_entry(ipvlan, &port->ipvlans, pnode) { - netif_set_gso_max_size(ipvlan->dev, dev->gso_max_size); - netif_set_gso_max_segs(ipvlan->dev, dev->gso_max_segs); + netif_inherit_tso_max(ipvlan->dev, dev); netdev_update_features(ipvlan->dev); } break; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index b00bc8173abe..0017bd0fe3bb 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -904,8 +904,7 @@ static int macvlan_init(struct net_device *dev) dev->vlan_features = lowerdev->vlan_features & MACVLAN_FEATURES; dev->vlan_features |= ALWAYS_ON_OFFLOADS; dev->hw_enc_features |= dev->features; - netif_set_gso_max_size(dev, lowerdev->gso_max_size); - netif_set_gso_max_segs(dev, lowerdev->gso_max_segs); + netif_inherit_tso_max(dev, lowerdev); dev->hard_header_len = lowerdev->hard_header_len; macvlan_set_lockdep_class(dev); @@ -1763,8 +1762,7 @@ static int macvlan_device_event(struct notifier_block *unused, break; case NETDEV_FEAT_CHANGE: list_for_each_entry(vlan, &port->vlans, list) { - netif_set_gso_max_size(vlan->dev, dev->gso_max_size); - netif_set_gso_max_segs(vlan->dev, dev->gso_max_segs); + netif_inherit_tso_max(vlan->dev, dev); netdev_update_features(vlan->dev); } break; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 3592014e50cc..f474e79a7745 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1758,8 +1758,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, if (ifmp && (dev->ifindex != 0)) peer->ifindex = ifmp->ifi_index; - netif_set_gso_max_size(peer, dev->gso_max_size); - netif_set_gso_max_segs(peer, dev->gso_max_segs); + netif_inherit_tso_max(peer, dev); err = register_netdevice(peer); put_net(net); diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c 
index 8a5e3a6d32d7..2bda692f70c5 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3683,8 +3683,7 @@ static void vxlan_config_apply(struct net_device *dev, if (lowerdev) { dst->remote_ifindex = conf->remote_ifindex; - netif_set_gso_max_size(dev, lowerdev->gso_max_size); - netif_set_gso_max_segs(dev, lowerdev->gso_max_segs); + netif_inherit_tso_max(dev, lowerdev); needed_headroom = lowerdev->hard_header_len; needed_headroom += lowerdev->needed_headroom; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eaf66e57d891..006bb5c0e413 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4895,6 +4895,9 @@ static inline void netif_set_gro_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_max_size, size); } +void netif_inherit_tso_max(struct net_device *to, + const struct net_device *from); + static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 788076b002b3..e40aa3e3641c 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -319,8 +319,7 @@ static void vlan_transfer_features(struct net_device *dev, { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); - netif_set_gso_max_size(vlandev, dev->gso_max_size); - netif_set_gso_max_segs(vlandev, dev->gso_max_segs); + netif_inherit_tso_max(vlandev, dev); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e5d23e75572a..839f2020b015 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -573,8 +573,7 @@ static int vlan_dev_init(struct net_device *dev) NETIF_F_ALL_FCOE; dev->features |= dev->hw_features | NETIF_F_LLTX; - netif_set_gso_max_size(dev, real_dev->gso_max_size); - netif_set_gso_max_segs(dev, real_dev->gso_max_segs); + netif_inherit_tso_max(dev, real_dev); if (dev->features & NETIF_F_VLAN_FEATURES) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); diff --git a/net/core/dev.c b/net/core/dev.c index c2d73595a7c3..eef1d19b130f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2992,6 +2992,18 @@ undo_rx: } EXPORT_SYMBOL(netif_set_real_num_queues); +/** + * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper + * @to: netdev to update + * @from: netdev from which to copy the limits + */ +void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) +{ + netif_set_gso_max_size(to, from->gso_max_size); + netif_set_gso_max_segs(to, from->gso_max_segs); +} +EXPORT_SYMBOL(netif_inherit_tso_max); + /** * netif_get_num_default_rss_queues - default number of RSS queues * -- cgit v1.2.3-59-g8ed1b From 14d7b8122fd591693a2388b98563707ba72c6780 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:32 -0700 Subject: net: don't allow user space to lift the device limits Up until commit 46e6b992c250 ("rtnetlink: allow GSO maximums to be set on device creation") the gso_max_segs and gso_max_size of a device were not controlled from user space. The quoted commit added the ability to control them because of the following setup: netns A | netns B veth<->veth eth0 If eth0 has TSO limitations and user wants to efficiently forward traffic between eth0 and the veths they should copy the TSO limitations of eth0 onto the veths. 
This would happen automatically for macvlans or ipvlan but veth users are not so lucky (given the loose coupling). Unfortunately the commit in question allowed users to also override the limits on real HW devices. It may be useful to control the max GSO size and someone may be using that ability (not that I know of any user), so create a separate set of knobs to reliably record the TSO limitations. Validate the user requests. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ net/core/dev.c | 35 +++++++++++++++++++++++++++++++++++ net/core/rtnetlink.c | 4 ++-- 3 files changed, 46 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 006bb5c0e413..e12f7de6d6ae 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1917,8 +1917,10 @@ enum netdev_ml_priv_type { * @rtnl_link_ops: Rtnl_link_ops * * @gso_max_size: Maximum size of generic segmentation offload + * @tso_max_size: Device (as in HW) limit on the max TSO request size * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO + * @tso_max_segs: Device (as in HW) limit on the max TSO segment count * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device @@ -2262,8 +2264,13 @@ struct net_device { /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; +#define TSO_LEGACY_MAX_SIZE 65536 +#define TSO_MAX_SIZE UINT_MAX + unsigned int tso_max_size; #define GSO_MAX_SEGS 65535 u16 gso_max_segs; +#define TSO_MAX_SEGS U16_MAX + u16 tso_max_segs; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; @@ -4895,6 +4902,8 @@ static inline void netif_set_gro_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_max_size, size); } +void netif_set_tso_max_size(struct net_device *dev, unsigned int size); +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs); void netif_inherit_tso_max(struct net_device *to, const struct net_device *from); diff --git a/net/core/dev.c b/net/core/dev.c index eef1d19b130f..6ae085b11373 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2992,6 +2992,39 @@ undo_rx: } EXPORT_SYMBOL(netif_set_real_num_queues); +/** + * netif_set_tso_max_size() - set the max size of TSO frames supported + * @dev: netdev to update + * @size: max skb->len of a TSO frame + * + * Set the limit on the size of TSO super-frames the device can handle. + * Unless explicitly set the stack will assume the value of %GSO_MAX_SIZE. + */ +void netif_set_tso_max_size(struct net_device *dev, unsigned int size) +{ + dev->tso_max_size = size; + if (size < READ_ONCE(dev->gso_max_size)) + netif_set_gso_max_size(dev, size); +} +EXPORT_SYMBOL(netif_set_tso_max_size); + +/** + * netif_set_tso_max_segs() - set the max number of segs supported for TSO + * @dev: netdev to update + * @segs: max number of TCP segments + * + * Set the limit on the number of TCP segments the device can generate from + * a single TSO super-frame. + * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. 
+ */ +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) +{ + dev->tso_max_segs = segs; + if (segs < READ_ONCE(dev->gso_max_segs)) + netif_set_gso_max_segs(dev, segs); +} +EXPORT_SYMBOL(netif_set_tso_max_segs); + /** * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper * @to: netdev to update @@ -10572,6 +10605,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_MAX_SIZE; + dev->tso_max_size = TSO_LEGACY_MAX_SIZE; + dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; #ifdef CONFIG_LOCKDEP diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index eea5ed09e1bb..e6d4b9272995 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2803,7 +2803,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); - if (max_size > GSO_MAX_SIZE) { + if (max_size > GSO_MAX_SIZE || max_size > dev->tso_max_size) { err = -EINVAL; goto errout; } @@ -2817,7 +2817,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); - if (max_segs > GSO_MAX_SEGS) { + if (max_segs > GSO_MAX_SEGS || max_segs > dev->tso_max_segs) { err = -EINVAL; goto errout; } -- cgit v1.2.3-59-g8ed1b From ee8b7a1156f357613646d6c69d07ac5a087a1071 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:33 -0700 Subject: net: make drivers set the TSO limit not the GSO limit Drivers should call the TSO setting helper, GSO is controllable by user space. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 12 ++++++------ drivers/net/ethernet/atheros/atl1e/atl1e_main.c | 2 +- drivers/net/ethernet/cavium/liquidio/lio_main.c | 2 +- drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 2 +- drivers/net/ethernet/emulex/benet/be_main.c | 2 +- drivers/net/ethernet/freescale/fec_main.c | 2 +- drivers/net/ethernet/hisilicon/hns/hns_enet.c | 4 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 ++-- drivers/net/ethernet/marvell/mv643xx_eth.c | 2 +- drivers/net/ethernet/marvell/mvneta.c | 2 +- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 +- drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 2 +- drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c | 2 +- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 2 +- drivers/net/ethernet/realtek/r8169_main.c | 8 ++++---- drivers/net/ethernet/sfc/ef100_nic.c | 9 ++++++--- drivers/net/ethernet/sfc/efx.c | 2 +- drivers/net/ethernet/sfc/falcon/efx.c | 2 +- drivers/net/hyperv/rndis_filter.c | 2 +- drivers/net/usb/aqc111.c | 2 +- drivers/net/usb/ax88179_178a.c | 2 +- drivers/net/usb/lan78xx.c | 2 +- drivers/net/usb/r8152.c | 2 +- drivers/s390/net/qeth_l2_main.c | 2 +- drivers/s390/net/qeth_l3_main.c | 2 +- net/bridge/br_if.c | 12 ++++++------ net/core/dev.c | 4 ++-- 28 files changed, 49 insertions(+), 46 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c9e75a9de282..f245e439f44c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1419,8 +1419,8 @@ static void bond_compute_features(struct bonding *bond) struct list_head *iter; struct slave *slave; unsigned short max_hard_header_len = ETH_HLEN; - unsigned int gso_max_size = 
GSO_MAX_SIZE; - u16 gso_max_segs = GSO_MAX_SEGS; + unsigned int tso_max_size = TSO_MAX_SIZE; + u16 tso_max_segs = TSO_MAX_SEGS; if (!bond_has_slaves(bond)) goto done; @@ -1449,8 +1449,8 @@ static void bond_compute_features(struct bonding *bond) if (slave->dev->hard_header_len > max_hard_header_len) max_hard_header_len = slave->dev->hard_header_len; - gso_max_size = min(gso_max_size, slave->dev->gso_max_size); - gso_max_segs = min(gso_max_segs, slave->dev->gso_max_segs); + tso_max_size = min(tso_max_size, slave->dev->tso_max_size); + tso_max_segs = min(tso_max_segs, slave->dev->tso_max_segs); } bond_dev->hard_header_len = max_hard_header_len; @@ -1463,8 +1463,8 @@ done: bond_dev->hw_enc_features |= xfrm_features; #endif /* CONFIG_XFRM_OFFLOAD */ bond_dev->mpls_features = mpls_features; - netif_set_gso_max_segs(bond_dev, gso_max_segs); - netif_set_gso_max_size(bond_dev, gso_max_size); + netif_set_tso_max_segs(bond_dev, tso_max_segs); + netif_set_tso_max_size(bond_dev, tso_max_size); bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c index 56e5f440e666..20681860a599 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c @@ -2395,7 +2395,7 @@ static int atl1e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) INIT_WORK(&adapter->reset_task, atl1e_reset_task); INIT_WORK(&adapter->link_chg_task, atl1e_link_chg_task); - netif_set_gso_max_size(netdev, MAX_TSO_SEG_SIZE); + netif_set_tso_max_size(netdev, MAX_TSO_SEG_SIZE); err = register_netdev(netdev); if (err) { netdev_err(netdev, "register netdevice failed\n"); diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index ba28aa444e5a..bee35ce60171 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -3563,7 +3563,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_LRO; } - netif_set_gso_max_size(netdev, OCTNIC_GSO_MAX_SIZE); + netif_set_tso_max_size(netdev, OCTNIC_GSO_MAX_SIZE); /* Copy of transmit encapsulation capabilities: * TSO, TSO6, Checksums for this device diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c index 568f211d91cc..ac196883f07e 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c @@ -2094,7 +2094,7 @@ static int setup_nic_devices(struct octeon_device *octeon_dev) | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_LRO; - netif_set_gso_max_size(netdev, OCTNIC_GSO_MAX_SIZE); + netif_set_tso_max_size(netdev, OCTNIC_GSO_MAX_SIZE); /* Copy of transmit encapsulation capabilities: * TSO, TSO6, Checksums for this device diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 5939068a8f62..cd4e243da5fa 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5204,7 +5204,7 @@ static void be_netdev_init(struct net_device *netdev) netdev->flags |= IFF_MULTICAST; - netif_set_gso_max_size(netdev, BE_MAX_GSO_SIZE - ETH_HLEN); + netif_set_tso_max_size(netdev, BE_MAX_GSO_SIZE - ETH_HLEN); netdev->netdev_ops = &be_netdev_ops; diff --git a/drivers/net/ethernet/freescale/fec_main.c 
b/drivers/net/ethernet/freescale/fec_main.c index 9f33ec838b52..6e52f3ad182f 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3566,7 +3566,7 @@ static int fec_enet_init(struct net_device *ndev) ndev->features |= NETIF_F_HW_VLAN_CTAG_RX; if (fep->quirks & FEC_QUIRK_HAS_CSUM) { - netif_set_gso_max_segs(ndev, FEC_MAX_TSO_SEGS); + netif_set_tso_max_segs(ndev, FEC_MAX_TSO_SEGS); /* enable hw accelerator */ ndev->features |= (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c index 22a463e15678..2f0bd21a9082 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c @@ -1782,7 +1782,7 @@ static int hns_nic_set_features(struct net_device *netdev, priv->ops.fill_desc = fill_tso_desc; priv->ops.maybe_stop_tx = hns_nic_maybe_stop_tso; /* The chip only support 7*4096 */ - netif_set_gso_max_size(netdev, 7 * 4096); + netif_set_tso_max_size(netdev, 7 * 4096); } else { priv->ops.fill_desc = fill_v2_desc; priv->ops.maybe_stop_tx = hns_nic_maybe_stop_tx; @@ -2168,7 +2168,7 @@ static void hns_nic_set_priv_ops(struct net_device *netdev) priv->ops.fill_desc = fill_tso_desc; priv->ops.maybe_stop_tx = hns_nic_maybe_stop_tso; /* This chip only support 7*4096 */ - netif_set_gso_max_size(netdev, 7 * 4096); + netif_set_tso_max_size(netdev, 7 * 4096); } else { priv->ops.fill_desc = fill_v2_desc; priv->ops.maybe_stop_tx = hns_nic_maybe_stop_tx; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index c4a4954aa317..292a025e5423 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -5051,12 +5051,12 @@ static void ixgbe_configure_dcb(struct ixgbe_adapter *adapter) if (!(adapter->flags & IXGBE_FLAG_DCB_ENABLED)) { if (hw->mac.type == ixgbe_mac_82598EB) - netif_set_gso_max_size(adapter->netdev, 65536); + netif_set_tso_max_size(adapter->netdev, 65536); return; } if (hw->mac.type == ixgbe_mac_82598EB) - netif_set_gso_max_size(adapter->netdev, 32768); + netif_set_tso_max_size(adapter->netdev, 32768); #ifdef IXGBE_FCOE if (adapter->netdev->features & NETIF_F_FCOE_MTU) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index c18801490649..57eff4e9e6de 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -3207,7 +3207,7 @@ static int mv643xx_eth_probe(struct platform_device *pdev) dev->hw_features = dev->features; dev->priv_flags |= IFF_UNICAST_FLT; - netif_set_gso_max_segs(dev, MV643XX_MAX_TSO_SEGS); + netif_set_tso_max_segs(dev, MV643XX_MAX_TSO_SEGS); /* MTU range: 64 - 9500 */ dev->min_mtu = 64; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index f6a54c7f0c69..384f5a16753d 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -5617,7 +5617,7 @@ static int mvneta_probe(struct platform_device *pdev) dev->hw_features |= dev->features; dev->vlan_features |= dev->features; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - netif_set_gso_max_segs(dev, MVNETA_MAX_TSO_SEGS); + netif_set_tso_max_segs(dev, MVNETA_MAX_TSO_SEGS); /* MTU range: 68 - 9676 */ dev->min_mtu = ETH_MIN_MTU; diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 1a835b48791b..2b7eade373be 100644 --- 
a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -6861,7 +6861,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, mvpp2_set_hw_csum(port, port->pool_long->id); dev->vlan_features |= features; - netif_set_gso_max_segs(dev, MVPP2_MAX_TSO_SEGS); + netif_set_tso_max_segs(dev, MVPP2_MAX_TSO_SEGS); dev->priv_flags |= IFF_UNICAST_FLT; /* MTU range: 68 - 9704 */ diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 441aafc26a08..53b2706d65a1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -2704,7 +2704,7 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) netdev->hw_features |= NETIF_F_LOOPBACK | NETIF_F_RXALL; - netif_set_gso_max_segs(netdev, OTX2_MAX_GSO_SEGS); + netif_set_tso_max_segs(netdev, OTX2_MAX_GSO_SEGS); netdev->watchdog_timeo = OTX2_TX_TIMEOUT; netdev->netdev_ops = &otx2_netdev_ops; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c index 9e87836ed8bf..86653bb8e403 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c @@ -652,7 +652,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) netdev->hw_features |= NETIF_F_RXALL; netdev->hw_features |= NETIF_F_HW_TC; - netif_set_gso_max_segs(netdev, OTX2_MAX_GSO_SEGS); + netif_set_tso_max_segs(netdev, OTX2_MAX_GSO_SEGS); netdev->watchdog_timeo = OTX2_TX_TIMEOUT; netdev->netdev_ops = &otx2vf_netdev_ops; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 5528d12d1f48..c60ead337d06 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2320,7 +2320,7 @@ static void nfp_net_netdev_init(struct nfp_net *nn) netdev->min_mtu = ETH_MIN_MTU; netdev->max_mtu = nn->max_mtu; - netif_set_gso_max_segs(netdev, NFP_NET_LSO_MAX_SEGS); + netif_set_tso_max_segs(netdev, NFP_NET_LSO_MAX_SEGS); netif_carrier_off(netdev); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index 790e1d5e4b4a..75b5018f2e1b 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -380,7 +380,7 @@ int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, /* Advertise but disable TSO by default. 
*/ netdev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6); - netif_set_gso_max_segs(netdev, NFP_NET_LSO_MAX_SEGS); + netif_set_tso_max_segs(netdev, NFP_NET_LSO_MAX_SEGS); netdev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL; netdev->features |= NETIF_F_LLTX; diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 33f5c5698ccb..3098d6672192 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5442,12 +5442,12 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) */ if (rtl_chip_supports_csum_v2(tp)) { dev->hw_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6; - netif_set_gso_max_size(dev, RTL_GSO_MAX_SIZE_V2); - netif_set_gso_max_segs(dev, RTL_GSO_MAX_SEGS_V2); + netif_set_tso_max_size(dev, RTL_GSO_MAX_SIZE_V2); + netif_set_tso_max_segs(dev, RTL_GSO_MAX_SEGS_V2); } else { dev->hw_features |= NETIF_F_SG | NETIF_F_TSO; - netif_set_gso_max_size(dev, RTL_GSO_MAX_SIZE_V1); - netif_set_gso_max_segs(dev, RTL_GSO_MAX_SEGS_V1); + netif_set_tso_max_size(dev, RTL_GSO_MAX_SIZE_V1); + netif_set_tso_max_segs(dev, RTL_GSO_MAX_SEGS_V1); } dev->hw_features |= NETIF_F_RXALL; diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c index b04911bc8c57..a69d756e09b9 100644 --- a/drivers/net/ethernet/sfc/ef100_nic.c +++ b/drivers/net/ethernet/sfc/ef100_nic.c @@ -1009,11 +1009,13 @@ static int ef100_process_design_param(struct efx_nic *efx, return 0; case ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_LEN: nic_data->tso_max_payload_len = min_t(u64, reader->value, GSO_MAX_SIZE); - netif_set_gso_max_size(efx->net_dev, nic_data->tso_max_payload_len); + netif_set_tso_max_size(efx->net_dev, + nic_data->tso_max_payload_len); return 0; case ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_NUM_SEGS: nic_data->tso_max_payload_num_segs = min_t(u64, reader->value, 0xffff); - netif_set_gso_max_segs(efx->net_dev, nic_data->tso_max_payload_num_segs); + netif_set_tso_max_segs(efx->net_dev, + nic_data->tso_max_payload_num_segs); return 0; case ESE_EF100_DP_GZ_TSO_MAX_NUM_FRAMES: nic_data->tso_max_frames = min_t(u64, reader->value, 0xffff); @@ -1138,7 +1140,8 @@ static int ef100_probe_main(struct efx_nic *efx) nic_data->tso_max_frames = ESE_EF100_DP_GZ_TSO_MAX_NUM_FRAMES_DEFAULT; nic_data->tso_max_payload_num_segs = ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_NUM_SEGS_DEFAULT; nic_data->tso_max_payload_len = ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_LEN_DEFAULT; - netif_set_gso_max_segs(net_dev, ESE_EF100_DP_GZ_TSO_MAX_HDR_NUM_SEGS_DEFAULT); + netif_set_tso_max_segs(net_dev, + ESE_EF100_DP_GZ_TSO_MAX_HDR_NUM_SEGS_DEFAULT); /* Read design parameters */ rc = ef100_check_design_params(efx); if (rc) { diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 5e7fe75cb1d4..5a772354da83 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -710,7 +710,7 @@ static int efx_register_netdev(struct efx_nic *efx) if (efx_nic_rev(efx) >= EFX_REV_HUNT_A0) net_dev->priv_flags |= IFF_UNICAST_FLT; net_dev->ethtool_ops = &efx_ethtool_ops; - netif_set_gso_max_segs(net_dev, EFX_TSO_MAX_SEGS); + netif_set_tso_max_segs(net_dev, EFX_TSO_MAX_SEGS); net_dev->min_mtu = EFX_MIN_MTU; net_dev->max_mtu = EFX_MAX_MTU; diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c index 60c595ef7589..b7282331faec 100644 --- a/drivers/net/ethernet/sfc/falcon/efx.c +++ b/drivers/net/ethernet/sfc/falcon/efx.c @@ -2267,7 +2267,7 @@ static int ef4_register_netdev(struct 
ef4_nic *efx) net_dev->irq = efx->pci_dev->irq; net_dev->netdev_ops = &ef4_netdev_ops; net_dev->ethtool_ops = &ef4_ethtool_ops; - netif_set_gso_max_segs(net_dev, EF4_TSO_MAX_SEGS); + netif_set_tso_max_segs(net_dev, EF4_TSO_MAX_SEGS); net_dev->min_mtu = EF4_MIN_MTU; net_dev->max_mtu = EF4_MAX_MTU; diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 448fcc325ed7..866af2cc27a3 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1431,7 +1431,7 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, */ net->features &= ~NETVSC_SUPPORTED_HW_FEATURES | net->hw_features; - netif_set_gso_max_size(net, gso_max_size); + netif_set_tso_max_size(net, gso_max_size); ret = rndis_filter_set_offload_params(net, nvdev, &offloads); diff --git a/drivers/net/usb/aqc111.c b/drivers/net/usb/aqc111.c index ca409d450a29..3020e81159d0 100644 --- a/drivers/net/usb/aqc111.c +++ b/drivers/net/usb/aqc111.c @@ -735,7 +735,7 @@ static int aqc111_bind(struct usbnet *dev, struct usb_interface *intf) dev->net->features |= AQ_SUPPORT_FEATURE; dev->net->vlan_features |= AQ_SUPPORT_VLAN_FEATURE; - netif_set_gso_max_size(dev->net, 65535); + netif_set_tso_max_size(dev->net, 65535); aqc111_read_fw_version(dev, aqc111_data); aqc111_data->autoneg = AUTONEG_ENABLE; diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index e2fa56b92685..7a8c11a26eb5 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1382,7 +1382,7 @@ static int ax88179_bind(struct usbnet *dev, struct usb_interface *intf) dev->net->hw_features |= dev->net->features; - netif_set_gso_max_size(dev->net, 16384); + netif_set_tso_max_size(dev->net, 16384); /* Enable checksum offload */ *tmp = AX_RXCOE_IP | AX_RXCOE_TCP | AX_RXCOE_UDP | diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 94e571fb61da..636a405844c5 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -4372,7 +4372,7 @@ static int lan78xx_probe(struct usb_interface *intf, /* MTU range: 68 - 9000 */ netdev->max_mtu = MAX_SINGLE_PACKET_SIZE; - netif_set_gso_max_size(netdev, LAN78XX_TSO_SIZE(dev)); + netif_set_tso_max_size(netdev, LAN78XX_TSO_SIZE(dev)); netif_napi_add(netdev, &dev->napi, lan78xx_poll, NAPI_POLL_WEIGHT); diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index ee41088c5251..d3b53db57c26 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -9658,7 +9658,7 @@ static int rtl8152_probe(struct usb_interface *intf, } netdev->ethtool_ops = &ops; - netif_set_gso_max_size(netdev, RTL_LIMITED_TSO_SIZE); + netif_set_tso_max_size(netdev, RTL_LIMITED_TSO_SIZE); /* MTU range: 68 - 1500 or 9194 */ netdev->min_mtu = ETH_MIN_MTU; diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 92698f79a4e0..2d4436cbcb47 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1129,7 +1129,7 @@ static int qeth_l2_setup_netdev(struct qeth_card *card) if (card->dev->hw_features & (NETIF_F_TSO | NETIF_F_TSO6)) { card->dev->needed_headroom = sizeof(struct qeth_hdr_tso); netif_keep_dst(card->dev); - netif_set_gso_max_size(card->dev, + netif_set_tso_max_size(card->dev, PAGE_SIZE * (QDIO_MAX_ELEMENTS_PER_BUFFER - 1)); } diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index ea3b6b18aa6e..8d44bce0477a 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -1907,7 +1907,7 @@ static int 
qeth_l3_setup_netdev(struct qeth_card *card) netif_keep_dst(card->dev); if (card->dev->hw_features & (NETIF_F_TSO | NETIF_F_TSO6)) - netif_set_gso_max_size(card->dev, + netif_set_tso_max_size(card->dev, PAGE_SIZE * (QETH_MAX_BUFFER_ELEMENTS(card) - 1)); netif_napi_add(card->dev, &card->napi, qeth_poll, NAPI_POLL_WEIGHT); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 55f47cadb114..47fcbade7389 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -517,16 +517,16 @@ void br_mtu_auto_adjust(struct net_bridge *br) static void br_set_gso_limits(struct net_bridge *br) { - unsigned int gso_max_size = GSO_MAX_SIZE; - u16 gso_max_segs = GSO_MAX_SEGS; + unsigned int tso_max_size = TSO_MAX_SIZE; const struct net_bridge_port *p; + u16 tso_max_segs = TSO_MAX_SEGS; list_for_each_entry(p, &br->port_list, list) { - gso_max_size = min(gso_max_size, p->dev->gso_max_size); - gso_max_segs = min(gso_max_segs, p->dev->gso_max_segs); + tso_max_size = min(tso_max_size, p->dev->tso_max_size); + tso_max_segs = min(tso_max_segs, p->dev->tso_max_segs); } - netif_set_gso_max_size(br->dev, gso_max_size); - netif_set_gso_max_segs(br->dev, gso_max_segs); + netif_set_tso_max_size(br->dev, tso_max_size); + netif_set_tso_max_segs(br->dev, tso_max_segs); } /* diff --git a/net/core/dev.c b/net/core/dev.c index 6ae085b11373..f036ccb61da4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3032,8 +3032,8 @@ EXPORT_SYMBOL(netif_set_tso_max_segs); */ void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) { - netif_set_gso_max_size(to, from->gso_max_size); - netif_set_gso_max_segs(to, from->gso_max_segs); + netif_set_tso_max_size(to, from->tso_max_size); + netif_set_tso_max_segs(to, from->tso_max_segs); } EXPORT_SYMBOL(netif_inherit_tso_max); -- cgit v1.2.3-59-g8ed1b From be76955dea93fe7ee9e0a6f961a7185290a2417f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 May 2022 10:04:12 -0700 Subject: net: fix kdoc on __dev_queue_xmit() Commit c526fd8f9f4f21 ("net: inline dev_queue_xmit()") exported __dev_queue_xmit(), now it's being rendered in html docs, triggering: Documentation/networking/kapi:92: net/core/dev.c:4101: WARNING: Missing matching underline for section title overline. Reported-by: Stephen Rothwell Link: https://lore.kernel.org/linux-next/20220503073420.6d3f135d@canb.auug.org.au/ Fixes: c526fd8f9f4f21 ("net: inline dev_queue_xmit()") Link: https://lore.kernel.org/r/20220509170412.1069190-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/dev.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index f036ccb61da4..186c6b07c60c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4131,30 +4131,25 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, } /** - * __dev_queue_xmit - transmit a buffer - * @skb: buffer to transmit - * @sb_dev: suboordinate device used for L2 forwarding offload - * - * Queue a buffer for transmission to a network device. The caller must - * have set the device and priority and built the buffer before calling - * this function. The function can be called from an interrupt. + * __dev_queue_xmit() - transmit a buffer + * @skb: buffer to transmit + * @sb_dev: suboordinate device used for L2 forwarding offload * - * A negative errno code is returned on a failure. A success does not - * guarantee the frame will be transmitted as it may be dropped due - * to congestion or traffic shaping. 
+ * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling + * this function. The function can be called from an interrupt. * - * ----------------------------------------------------------------------------------- - * I notice this method can also return errors from the queue disciplines, - * including NET_XMIT_DROP, which is a positive value. So, errors can also - * be positive. + * When calling this method, interrupts MUST be enabled. This is because + * the BH enable code must have IRQs enabled so that it will not deadlock. * - * Regardless of the return value, the skb is consumed, so it is currently - * difficult to retry a send to this method. (You can bump the ref count - * before sending to hold a reference for retry if you are careful.) + * Regardless of the return value, the skb is consumed, so it is currently + * difficult to retry a send to this method. (You can bump the ref count + * before sending to hold a reference for retry if you are careful.) * - * When calling this method, interrupts MUST be enabled. This is because - * the BH enable code must have IRQs enabled so that it will not deadlock. - * --BLG + * Return: + * * 0 - buffer successfully transmitted + * * positive qdisc return code - NET_XMIT_DROP etc. + * * negative errno - other errors */ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { -- cgit v1.2.3-59-g8ed1b From d7ea0d9df2a6265b2b180d17ebc64b38105968fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 May 2022 20:57:40 -0700 Subject: net: remove two BUG() from skb_checksum_help() I have a syzbot report that managed to get a crash in skb_checksum_help() If syzbot can trigger these BUG(), it makes sense to replace them with more friendly WARN_ON_ONCE() since skb_checksum_help() can instead return an error code. Note that syzbot will still crash there, until real bug is fixed. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 186c6b07c60c..918548c73872 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3265,11 +3265,15 @@ int skb_checksum_help(struct sk_buff *skb) } offset = skb_checksum_start_offset(skb); - BUG_ON(offset >= skb_headlen(skb)); + ret = -EINVAL; + if (WARN_ON_ONCE(offset >= skb_headlen(skb))) + goto out; + csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; - BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); + if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) + goto out; ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); if (ret) -- cgit v1.2.3-59-g8ed1b From eeee4b77dc52b9128ee450c110caeb92ce3f3f9c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 May 2022 20:57:41 -0700 Subject: net: add more debug info in skb_checksum_help() This is a followup of previous patch. Dumping the stack trace is a good start, but printing basic skb information is probably better. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 918548c73872..91b7e7784da9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3266,15 +3266,17 @@ int skb_checksum_help(struct sk_buff *skb) offset = skb_checksum_start_offset(skb); ret = -EINVAL; - if (WARN_ON_ONCE(offset >= skb_headlen(skb))) + if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); goto out; - + } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; - if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) + if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); goto out; - + } ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); if (ret) goto out; -- cgit v1.2.3-59-g8ed1b From fa926bb3e491221a76bd476a990019cd55df8a30 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 May 2022 12:07:20 -0700 Subject: net: update the register_netdevice() kdoc The BUGS section looks quite dated, the registration is under rtnl lock. Remove some obvious information while at it. Link: https://lore.kernel.org/r/20220511190720.1401356-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/dev.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 91b7e7784da9..a601da3b4a7c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9927,22 +9927,14 @@ void netif_tx_stop_all_queues(struct net_device *dev) EXPORT_SYMBOL(netif_tx_stop_all_queues); /** - * register_netdevice - register a network device - * @dev: device to register - * - * Take a completed network device structure and add it to the kernel - * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier - * chain. 0 is returned on success. A negative errno code is returned - * on a failure to set up the device, or if the name is a duplicate. + * register_netdevice() - register a network device + * @dev: device to register * - * Callers must hold the rtnl semaphore. You may want - * register_netdev() instead of this. - * - * BUGS: - * The locking appears insufficient to guarantee two parallel registers - * will not get the same name. + * Take a prepared network device structure and make it externally accessible. + * A %NETDEV_REGISTER message is sent to the netdev notifier chain. + * Callers must hold the rtnl lock - you may want register_netdev() + * instead of this. */ - int register_netdevice(struct net_device *dev) { int ret; -- cgit v1.2.3-59-g8ed1b From 7c4e983c4f3cf94fcd879730c6caa877e0768a4d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:33:57 -0700 Subject: net: allow gso_max_size to exceed 65536 The code for gso_max_size was added originally to allow for debugging and workaround of buggy devices that couldn't support TSO with blocks 64K in size. The original reason for limiting it to 64K was because that was the existing limits of IPv4 and non-jumbogram IPv6 length fields. With the addition of Big TCP we can remove this limit and allow the value to potentially go up to UINT_MAX and instead be limited by the tso_max_size value. So in order to support this we need to go through and clean up the remaining users of the gso_max_size value so that the values will cap at 64K for non-TCPv6 flows. 
In addition we can clean up the GSO_MAX_SIZE value so that 64K becomes GSO_LEGACY_MAX_SIZE and UINT_MAX will now be the upper limit for GSO_MAX_SIZE. v6: (edumazet) fixed a compile error if CONFIG_IPV6=n, in a new sk_trim_gso_size() helper. netif_set_tso_max_size() caps the requested TSO size with GSO_MAX_SIZE. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe.h | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- drivers/net/ethernet/sfc/ef100_nic.c | 3 ++- drivers/net/ethernet/sfc/falcon/tx.c | 3 ++- drivers/net/ethernet/sfc/tx_common.c | 3 ++- drivers/net/ethernet/synopsys/dwc-xlgmac.h | 3 ++- drivers/net/hyperv/rndis_filter.c | 2 +- drivers/scsi/fcoe/fcoe.c | 2 +- include/linux/netdevice.h | 4 +++- net/bpf/test_run.c | 2 +- net/core/dev.c | 7 ++++--- net/core/rtnetlink.c | 2 +- net/core/sock.c | 14 ++++++++++++++ net/ipv4/tcp_bbr.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/sctp/output.c | 3 ++- 16 files changed, 40 insertions(+), 17 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index 607a2c90513b..d9547552ceef 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -151,7 +151,8 @@ #define XGBE_TX_MAX_BUF_SIZE (0x3fff & ~(64 - 1)) /* Descriptors required for maximum contiguous TSO/GSO packet */ -#define XGBE_TX_MAX_SPLIT ((GSO_MAX_SIZE / XGBE_TX_MAX_BUF_SIZE) + 1) +#define XGBE_TX_MAX_SPLIT \ + ((GSO_LEGACY_MAX_SIZE / XGBE_TX_MAX_BUF_SIZE) + 1) /* Maximum possible descriptors needed for an SKB: * - Maximum number of SKB frags diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index fb11081001a0..838870bc6dbd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -2038,7 +2038,7 @@ mlx5e_hw_gro_skb_has_enough_space(struct sk_buff *skb, u16 data_bcnt) { int nr_frags = skb_shinfo(skb)->nr_frags; - return PAGE_SIZE * nr_frags + data_bcnt <= GSO_MAX_SIZE; + return PAGE_SIZE * nr_frags + data_bcnt <= GRO_MAX_SIZE; } static void diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c index a69d756e09b9..b2536d2c218a 100644 --- a/drivers/net/ethernet/sfc/ef100_nic.c +++ b/drivers/net/ethernet/sfc/ef100_nic.c @@ -1008,7 +1008,8 @@ static int ef100_process_design_param(struct efx_nic *efx, } return 0; case ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_LEN: - nic_data->tso_max_payload_len = min_t(u64, reader->value, GSO_MAX_SIZE); + nic_data->tso_max_payload_len = min_t(u64, reader->value, + GSO_LEGACY_MAX_SIZE); netif_set_tso_max_size(efx->net_dev, nic_data->tso_max_payload_len); return 0; diff --git a/drivers/net/ethernet/sfc/falcon/tx.c b/drivers/net/ethernet/sfc/falcon/tx.c index f7306e93a8b8..b9369483758c 100644 --- a/drivers/net/ethernet/sfc/falcon/tx.c +++ b/drivers/net/ethernet/sfc/falcon/tx.c @@ -98,7 +98,8 @@ unsigned int ef4_tx_max_skb_descs(struct ef4_nic *efx) /* Possibly more for PCIe page boundaries within input fragments */ if (PAGE_SIZE > EF4_PAGE_SIZE) max_descs += max_t(unsigned int, MAX_SKB_FRAGS, - DIV_ROUND_UP(GSO_MAX_SIZE, EF4_PAGE_SIZE)); + DIV_ROUND_UP(GSO_LEGACY_MAX_SIZE, + EF4_PAGE_SIZE)); return max_descs; } diff --git a/drivers/net/ethernet/sfc/tx_common.c b/drivers/net/ethernet/sfc/tx_common.c index 9bc8281b7f5b..658ea2d34070 100644 --- a/drivers/net/ethernet/sfc/tx_common.c +++ b/drivers/net/ethernet/sfc/tx_common.c 
@@ -416,7 +416,8 @@ unsigned int efx_tx_max_skb_descs(struct efx_nic *efx) /* Possibly more for PCIe page boundaries within input fragments */ if (PAGE_SIZE > EFX_PAGE_SIZE) max_descs += max_t(unsigned int, MAX_SKB_FRAGS, - DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE)); + DIV_ROUND_UP(GSO_LEGACY_MAX_SIZE, + EFX_PAGE_SIZE)); return max_descs; } diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac.h b/drivers/net/ethernet/synopsys/dwc-xlgmac.h index 98e3a271e017..a848e10f3ea4 100644 --- a/drivers/net/ethernet/synopsys/dwc-xlgmac.h +++ b/drivers/net/ethernet/synopsys/dwc-xlgmac.h @@ -38,7 +38,8 @@ #define XLGMAC_RX_DESC_MAX_DIRTY (XLGMAC_RX_DESC_CNT >> 3) /* Descriptors required for maximum contiguous TSO/GSO packet */ -#define XLGMAC_TX_MAX_SPLIT ((GSO_MAX_SIZE / XLGMAC_TX_MAX_BUF_SIZE) + 1) +#define XLGMAC_TX_MAX_SPLIT \ + ((GSO_LEGACY_MAX_SIZE / XLGMAC_TX_MAX_BUF_SIZE) + 1) /* Maximum possible descriptors needed for a SKB */ #define XLGMAC_TX_MAX_DESC_NR (MAX_SKB_FRAGS + XLGMAC_TX_MAX_SPLIT + 2) diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 866af2cc27a3..6da36cb8af80 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1349,7 +1349,7 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, struct net_device_context *net_device_ctx = netdev_priv(net); struct ndis_offload hwcaps; struct ndis_offload_params offloads; - unsigned int gso_max_size = GSO_MAX_SIZE; + unsigned int gso_max_size = GSO_LEGACY_MAX_SIZE; int ret; /* Find HW offload capabilities */ diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 44ca6110213c..79b2827e4081 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -667,7 +667,7 @@ static void fcoe_netdev_features_change(struct fc_lport *lport, if (netdev->features & NETIF_F_FSO) { lport->seq_offload = 1; - lport->lso_max = netdev->gso_max_size; + lport->lso_max = min(netdev->gso_max_size, GSO_LEGACY_MAX_SIZE); FCOE_NETDEV_DBG(netdev, "Supports LSO for max len 0x%x\n", lport->lso_max); } else { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 536321691c72..ce780e352f43 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2272,7 +2272,9 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; /* for setting kernel sock attribute on TCP connection setup */ -#define GSO_MAX_SIZE 65536 +#define GSO_LEGACY_MAX_SIZE 65536u +#define GSO_MAX_SIZE UINT_MAX + unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 8d54fef9a568..9b5a1f630bb0 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1001,7 +1001,7 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) cb->pkt_len = skb->len; } else { if (__skb->wire_len < skb->len || - __skb->wire_len > GSO_MAX_SIZE) + __skb->wire_len > GSO_LEGACY_MAX_SIZE) return -EINVAL; cb->pkt_len = __skb->wire_len; } diff --git a/net/core/dev.c b/net/core/dev.c index a601da3b4a7c..830beb05161a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2998,11 +2998,12 @@ EXPORT_SYMBOL(netif_set_real_num_queues); * @size: max skb->len of a TSO frame * * Set the limit on the size of TSO super-frames the device can handle. - * Unless explicitly set the stack will assume the value of %GSO_MAX_SIZE. + * Unless explicitly set the stack will assume the value of + * %GSO_LEGACY_MAX_SIZE. 
*/ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) { - dev->tso_max_size = size; + dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); } @@ -10595,7 +10596,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f35cc21298ac..f2b0f747d3d2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2817,7 +2817,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); - if (max_size > GSO_MAX_SIZE || max_size > dev->tso_max_size) { + if (max_size > dev->tso_max_size) { err = -EINVAL; goto errout; } diff --git a/net/core/sock.c b/net/core/sock.c index 6b287eb5427b..24a46a1e4f28 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2293,6 +2293,19 @@ void sk_free_unlock_clone(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +static void sk_trim_gso_size(struct sock *sk) +{ + if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) + return; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6 && + sk_is_tcp(sk) && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return; +#endif + sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; +} + void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; @@ -2312,6 +2325,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); + sk_trim_gso_size(sk); sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index c7d30a3bbd81..075e744bfb48 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -310,7 +310,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk) */ bytes = min_t(unsigned long, sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), - GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); + GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); return min(segs, 0x7FU); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b092228e4342..b4b2284ed4a2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1553,7 +1553,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, * SO_SNDBUF values. * Also allow first and last skb in retransmit queue to be split. */ - limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE); + limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE); if (unlikely((sk->sk_wmem_queued >> 1) > limit && tcp_queue != TCP_FRAG_IN_WRITE_QUEUE && skb != tcp_rtx_queue_head(sk) && diff --git a/net/sctp/output.c b/net/sctp/output.c index 72fe6669c50d..a63df055ac57 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -134,7 +134,8 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, dst_hold(tp->dst); sk_setup_caps(sk, tp->dst); } - packet->max_size = sk_can_gso(sk) ? READ_ONCE(tp->dst->dev->gso_max_size) + packet->max_size = sk_can_gso(sk) ? 
min(READ_ONCE(tp->dst->dev->gso_max_size), + GSO_LEGACY_MAX_SIZE) : asoc->pathmtu; rcu_read_unlock(); } -- cgit v1.2.3-59-g8ed1b From 0fe79f28bfaf73b66b7b1562d2468f94aa03bd12 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:34:03 -0700 Subject: net: allow gro_max_size to exceed 65536 Allow the gro_max_size to exceed a value larger than 65536. There weren't really any external limitations that prevented this other than the fact that IPv4 only supports a 16 bit length field. Since we have the option of adding a hop-by-hop header for IPv6 we can allow IPv6 to exceed this value and for IPv4 and non-TCP flows we can cap things at 65536 via a constant rather than relying on gro_max_size. [edumazet] limit GRO_MAX_SIZE to (8 * 65535) to avoid overflows. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- include/linux/netdevice.h | 6 +++++- include/net/ipv6.h | 2 +- net/core/dev.c | 2 +- net/core/gro.c | 8 ++++++++ net/core/rtnetlink.c | 8 -------- 6 files changed, 16 insertions(+), 12 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 838870bc6dbd..24de37b79f5a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -2038,7 +2038,7 @@ mlx5e_hw_gro_skb_has_enough_space(struct sk_buff *skb, u16 data_bcnt) { int nr_frags = skb_shinfo(skb)->nr_frags; - return PAGE_SIZE * nr_frags + data_bcnt <= GRO_MAX_SIZE; + return PAGE_SIZE * nr_frags + data_bcnt <= GRO_LEGACY_MAX_SIZE; } static void diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fd38847d0dc7..d57ce248004c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2161,7 +2161,11 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; int napi_defer_hard_irqs; -#define GRO_MAX_SIZE 65536 +#define GRO_LEGACY_MAX_SIZE 65536u +/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), + * and shinfo->gso_segs is a 16bit field. 
From 97e719a82b43c6c2bb5eebdb3c5d479a332ac2ac Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 15 May 2022 21:24:53 -0700
Subject: net: fix possible race in skb_attempt_defer_free()

A cpu can observe sd->defer_count reaching 128, and call
smp_call_function_single_async().

The problem is that the remote CPU can clear sd->defer_count before
the IPI is run/acknowledged.

Other cpus can queue more packets and also decide to call
smp_call_function_single_async() while the pending IPI was not yet
delivered.

This is a common issue with smp_call_function_single_async(). Callers
must ensure correct synchronization and serialization.

I triggered this issue while experimenting with a smaller threshold.
Performing the call to smp_call_function_single_async() under
sd->defer_lock protection did not solve the problem.

Commit 5a18ceca6350 ("smp: Allow smp_call_function_single_async() to
insert locked csd") replaced an informative WARN_ON_ONCE() with a
return of -EBUSY, which is often ignored. Testing for CSD_FLAG_LOCK
presence is racy anyway.

Fixes: 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu lists")
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/linux/netdevice.h | 1 +
 net/core/dev.c | 7 +++++--
 net/core/skbuff.c | 5 ++---
 3 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d57ce248004c..cbaf312e365b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3136,6 +3136,7 @@ struct softnet_data {
 	/* Another possibly contended cache line */
 	spinlock_t		defer_lock ____cacheline_aligned_in_smp;
 	int			defer_count;
+	int			defer_ipi_scheduled;
 	struct sk_buff		*defer_list;
 	call_single_data_t	defer_csd;
 };
diff --git a/net/core/dev.c b/net/core/dev.c
index d93456c75b55..a5e663e1a75a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4582,9 +4582,12 @@ static void rps_trigger_softirq(void *data)
 #endif /* CONFIG_RPS */
 
 /* Called from hardirq (IPI) context */
-static void trigger_rx_softirq(void *data __always_unused)
+static void trigger_rx_softirq(void *data)
 {
+	struct softnet_data *sd = data;
+
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	smp_store_release(&sd->defer_ipi_scheduled, 0);
 }
 
 /*
@@ -11382,7 +11385,7 @@ static int __init net_dev_init(void)
 		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
 		sd->cpu = i;
 #endif
-		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL);
+		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
 		spin_lock_init(&sd->defer_lock);
 
 		init_gro_hash(&sd->backlog);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2fea964f09d8..b40c8cdf4785 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6516,8 +6516,7 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	sd->defer_count++;
 
 	/* kick every time queue length reaches 128.
-	 * This should avoid blocking in smp_call_function_single_async().
-	 * This condition should hardly be bit under normal conditions,
+	 * This condition should hardly be hit under normal conditions,
 	 * unless cpu suddenly stopped to receive NIC interrupts.
 	 */
 	kick = sd->defer_count == 128;
@@ -6527,6 +6526,6 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
 	 * if we are unlucky enough (this seems very unlikely).
 	 */
-	if (unlikely(kick))
+	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
 		smp_call_function_single_async(cpu, &sd->defer_csd);
 }
--
cgit v1.2.3-59-g8ed1b
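The cmpxchg() above is an instance of a general pattern for smp_call_function_single_async(): deduplicate kicks with an atomic flag that the target clears once it has run, so at most one call per csd is ever in flight. A user-space analogue using C11 atomics, where kick_remote() merely stands in for the IPI; none of this is kernel API:

#include <stdatomic.h>

static atomic_int defer_ipi_scheduled;

/* Runs on the target: do the deferred work, then clear the flag with
 * release semantics so the work is visible before the next kick.
 */
static void remote_handler(void)
{
	/* ... drain the deferred list here ... */
	atomic_store_explicit(&defer_ipi_scheduled, 0, memory_order_release);
}

/* Runs on any producer: only the caller that flips 0 -> 1 sends the
 * kick, so the csd is never reused while still pending.
 */
static void maybe_kick(void (*kick_remote)(void))
{
	int expected = 0;

	if (atomic_compare_exchange_strong(&defer_ipi_scheduled,
					   &expected, 1))
		kick_remote();
}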
From 2db60eed1a957423cf06ee1060fc45ed3971990d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 15 May 2022 21:24:54 -0700
Subject: net: use napi_consume_skb() in skb_defer_free_flush()

skb_defer_free_flush() runs from softirq context, so we have the
opportunity to refill the napi_alloc_cache, and/or use
kmem_cache_free_bulk() when this cache is full.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index a5e663e1a75a..d0b34bc50706 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6633,7 +6633,7 @@ static void skb_defer_free_flush(struct softnet_data *sd)
 	while (skb != NULL) {
 		next = skb->next;
-		__kfree_skb(skb);
+		napi_consume_skb(skb, 1);
 		skb = next;
 	}
 }
--
cgit v1.2.3-59-g8ed1b
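The second argument of napi_consume_skb() acts as a "called from NAPI/softirq context" flag: nonzero allows the freed skb to refill the per-cpu napi_alloc_cache or be bulk-freed, zero falls back to an immediate free. A toy model of that contract with stand-in types; the real batching lives in net/core/skbuff.c:

#include <stdlib.h>

struct toy_skb { struct toy_skb *next; };

#define CACHE_MAX 64
static struct toy_skb *cache[CACHE_MAX];
static unsigned int cache_len;

/* Toy model: budget != 0 means "softirq context", so freed buffers
 * may be cached for reuse; budget == 0 frees immediately.
 */
static void toy_consume(struct toy_skb *skb, int budget)
{
	if (budget && cache_len < CACHE_MAX) {
		cache[cache_len++] = skb;	/* refill the cache */
		return;
	}
	free(skb);	/* bulk-freed in the real implementation */
}

static void toy_flush(struct toy_skb *list)
{
	while (list) {
		struct toy_skb *next = list->next;

		toy_consume(list, 1);
		list = next;
	}
}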
From 39564c3fdc6684c6726b63e131d2a9f3809811cb Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 15 May 2022 21:24:55 -0700
Subject: net: add skb_defer_max sysctl

commit 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu
lists") added another per-cpu cache of skbs. It was expected to be
small, and an IPI was forced whenever the list reached 128 skbs.

We might need to control queue capacity and the added latency more
precisely. An IPI is generated whenever the queue reaches half
capacity.

Default value of the new limit is 64.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 Documentation/admin-guide/sysctl/net.rst | 8 ++++++++
 net/core/dev.c | 1 +
 net/core/dev.h | 2 +-
 net/core/skbuff.c | 15 +++++++++------
 net/core/sysctl_net_core.c | 8 ++++++++
 5 files changed, 27 insertions(+), 7 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index f86b5e1623c6..fa4dcdb283cf 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -322,6 +322,14 @@ a leaked reference faster. A larger value may be useful to prevent false
 warnings on slow/loaded systems.
 Default value is 10, minimum 1, maximum 3600.
 
+skb_defer_max
+-------------
+
+Max size (in skbs) of the per-cpu list of skbs being freed
+by the cpu which allocated them. Used by TCP stack so far.
+
+Default: 64
+
 optmem_max
 ----------
 
diff --git a/net/core/dev.c b/net/core/dev.c
index d0b34bc50706..6359f8953269 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4330,6 +4330,7 @@ int netdev_max_backlog __read_mostly = 1000;
 EXPORT_SYMBOL(netdev_max_backlog);
 
 int netdev_tstamp_prequeue __read_mostly = 1;
+unsigned int sysctl_skb_defer_max __read_mostly = 64;
 int netdev_budget __read_mostly = 300;
 /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
 unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
diff --git a/net/core/dev.h b/net/core/dev.h
index 328b37af90ba..cbb8a925175a 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -39,7 +39,7 @@ void dev_addr_check(struct net_device *dev);
 /* sysctls not referred to from outside net/core/ */
 extern int		netdev_budget;
 extern unsigned int	netdev_budget_usecs;
-
+extern unsigned int	sysctl_skb_defer_max;
 extern int		netdev_tstamp_prequeue;
 extern int		netdev_unregister_timeout_secs;
 extern int		weight_p;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b40c8cdf4785..1d10bb4adec1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -80,6 +80,7 @@
 #include
 #include
 
+#include "dev.h"
 #include "sock_destructor.h"
 
 struct kmem_cache *skbuff_head_cache __ro_after_init;
@@ -6496,16 +6497,21 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	int cpu = skb->alloc_cpu;
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int defer_max;
 	bool kick;
 
 	if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || !cpu_online(cpu) ||
 	    cpu == raw_smp_processor_id()) {
-		__kfree_skb(skb);
+nodefer:	__kfree_skb(skb);
 		return;
 	}
 
 	sd = &per_cpu(softnet_data, cpu);
+	defer_max = READ_ONCE(sysctl_skb_defer_max);
+	if (READ_ONCE(sd->defer_count) >= defer_max)
+		goto nodefer;
+
 	/* We do not send an IPI or any signal.
 	 * Remote cpu will eventually call skb_defer_free_flush()
 	 */
@@ -6515,11 +6521,8 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	WRITE_ONCE(sd->defer_list, skb);
 	sd->defer_count++;
 
-	/* kick every time queue length reaches 128.
-	 * This condition should hardly be hit under normal conditions,
-	 * unless cpu suddenly stopped to receive NIC interrupts.
-	 */
-	kick = sd->defer_count == 128;
+	/* Send an IPI every time queue reaches half capacity. */
+	kick = sd->defer_count == (defer_max >> 1);
 
 	spin_unlock_irqrestore(&sd->defer_lock, flags);
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 195ca5c28771..ca8d38325e1e 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -578,6 +578,14 @@ static struct ctl_table net_core_table[] = {
 		.extra1		= SYSCTL_ONE,
 		.extra2		= &int_3600,
 	},
+	{
+		.procname	= "skb_defer_max",
+		.data		= &sysctl_skb_defer_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
 	{ }
 };
--
cgit v1.2.3-59-g8ed1b
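Since the limit is an ordinary sysctl, it can be tuned at runtime through /proc/sys/net/core/skb_defer_max. A minimal (privileged) user-space sketch; the value 128 is just an example, doubling the default of 64:

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/net/core/skb_defer_max";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	/* With 128, an IPI is sent once the queue holds 64 skbs
	 * (half capacity, per the kick logic above).
	 */
	fprintf(f, "%u\n", 128u);
	return fclose(f) ? 1 : 0;
}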
From 909876500251b3b48480a840bbf9053588254eee Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 15 May 2022 21:24:56 -0700
Subject: net: call skb_defer_free_flush() before each napi_poll()

skb_defer_free_flush() can consume cpu cycles, so it seems better to
call it in the inner loop:

- Potentially frees page/skb that will be reallocated while hot.

- Account for the cpu cycles in the @time_limit determination.

- Keep softnet_data.defer_count small to reduce chances for
  skb_attempt_defer_free() to send an IPI.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 6359f8953269..04fd056f7f74 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6655,6 +6655,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 	for (;;) {
 		struct napi_struct *n;
 
+		skb_defer_free_flush(sd);
+
 		if (list_empty(&list)) {
 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 				goto end;
@@ -6684,8 +6686,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 	net_rps_action_and_irq_enable(sd);
-end:
-	skb_defer_free_flush(sd);
+end:;
 }
 
 struct netdev_adjacent {
--
cgit v1.2.3-59-g8ed1b
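The move can be read as a general event-loop guideline: do per-iteration housekeeping inside the budgeted loop so its cost is charged against the same time limit, and so freed memory is recycled while still cache-hot. A toy C sketch of the resulting loop shape; flush_deferred() and poll_one() are stand-ins for skb_defer_free_flush() and napi_poll(), not kernel functions:

#include <stdbool.h>
#include <time.h>

static void flush_deferred(void) { /* housekeeping, e.g. freeing */ }
static bool poll_one(void)       { return true; /* more work left? */ }

static void rx_loop(void)
{
	struct timespec start, now;
	int budget = 300;

	clock_gettime(CLOCK_MONOTONIC, &start);
	while (budget--) {
		/* Inside the loop: housekeeping cycles count toward
		 * the same time limit as the polling work itself.
		 */
		flush_deferred();

		if (!poll_one())
			break;

		clock_gettime(CLOCK_MONOTONIC, &now);
		if (now.tv_sec != start.tv_sec)	/* time limit hit */
			break;
	}
}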