From be2644aac3e1db02d09f45d56206bbdafca582a2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 1 Oct 2019 10:49:06 -0700
Subject: tcp: add ipv6_addr_v4mapped_loopback() helper

tcp_twsk_unique() has a hard coded assumption about ipv4 loopback
being 127/8

Lets instead use the standard ipv4_is_loopback() method,
in a new ipv6_addr_v4mapped_loopback() helper.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 009605c56f20..d04b7abe2a4c 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -696,6 +696,11 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a)
 					cpu_to_be32(0x0000ffff))) == 0UL;
 }
 
+static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a)
+{
+	return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]);
+}
+
 static inline u32 ipv6_portaddr_hash(const struct net *net,
 				     const struct in6_addr *addr6,
 				     unsigned int port)
-- 
cgit v1.2.3-59-g8ed1b


From ff92741270bf8b6e78aa885f166b68c7a67ab13a Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Mon, 30 Sep 2019 11:48:15 +0200
Subject: net: introduce name_node struct to be used in hashlist

Introduce name_node structure to hold name of device and put it into
hashlist instead of putting there struct net_device directly. Add a
necessary infrastructure to manipulate the hashlist. This prepares
the code to use the same hashlist for alternative names introduced
later in this set.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 10 ++++-
 net/core/dev.c            | 97 ++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 87 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9eda1c31d1f7..e92bc5467256 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -925,6 +925,12 @@ struct dev_ifalias {
 struct devlink;
 struct tlsdev_ops;
 
+struct netdev_name_node {
+	struct hlist_node hlist;
+	struct net_device *dev;
+	const char *name;
+};
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -1564,7 +1570,7 @@ enum netdev_priv_flags {
  *		(i.e. as seen by users in the "Space.c" file).  It is the name
  *		of the interface.
  *
- *	@name_hlist: 	Device name hash chain, please keep it close to name[]
+ *	@name_node:	Name hashlist node
  *	@ifalias:	SNMP alias
  *	@mem_end:	Shared memory end
  *	@mem_start:	Shared memory start
@@ -1774,7 +1780,7 @@ enum netdev_priv_flags {
 
 struct net_device {
 	char			name[IFNAMSIZ];
-	struct hlist_node	name_hlist;
+	struct netdev_name_node	*name_node;
 	struct dev_ifalias	__rcu *ifalias;
 	/*
 	 *	I/O specific fields
diff --git a/net/core/dev.c b/net/core/dev.c
index 21a9c2987cbb..d2053d07c94a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -228,6 +228,67 @@ static inline void rps_unlock(struct softnet_data *sd)
 #endif
 }
 
+static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
+						       const char *name)
+{
+	struct netdev_name_node *name_node;
+
+	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
+	if (!name_node)
+		return NULL;
+	INIT_HLIST_NODE(&name_node->hlist);
+	name_node->dev = dev;
+	name_node->name = name;
+	return name_node;
+}
+
+static struct netdev_name_node *
+netdev_name_node_head_alloc(struct net_device *dev)
+{
+	return netdev_name_node_alloc(dev, dev->name);
+}
+
+static void netdev_name_node_free(struct netdev_name_node *name_node)
+{
+	kfree(name_node);
+}
+
+static void netdev_name_node_add(struct net *net,
+				 struct netdev_name_node *name_node)
+{
+	hlist_add_head_rcu(&name_node->hlist,
+			   dev_name_hash(net, name_node->name));
+}
+
+static void netdev_name_node_del(struct netdev_name_node *name_node)
+{
+	hlist_del_rcu(&name_node->hlist);
+}
+
+static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
+							const char *name)
+{
+	struct hlist_head *head = dev_name_hash(net, name);
+	struct netdev_name_node *name_node;
+
+	hlist_for_each_entry(name_node, head, hlist)
+		if (!strcmp(name_node->name, name))
+			return name_node;
+	return NULL;
+}
+
+static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
+							    const char *name)
+{
+	struct hlist_head *head = dev_name_hash(net, name);
+	struct netdev_name_node *name_node;
+
+	hlist_for_each_entry_rcu(name_node, head, hlist)
+		if (!strcmp(name_node->name, name))
+			return name_node;
+	return NULL;
+}
+
 /* Device list insertion */
 static void list_netdevice(struct net_device *dev)
 {
@@ -237,7 +298,7 @@ static void list_netdevice(struct net_device *dev)
 
 	write_lock_bh(&dev_base_lock);
 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
-	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+	netdev_name_node_add(net, dev->name_node);
 	hlist_add_head_rcu(&dev->index_hlist,
 			   dev_index_hash(net, dev->ifindex));
 	write_unlock_bh(&dev_base_lock);
@@ -255,7 +316,7 @@ static void unlist_netdevice(struct net_device *dev)
 	/* Unlink dev from the device chain */
 	write_lock_bh(&dev_base_lock);
 	list_del_rcu(&dev->dev_list);
-	hlist_del_rcu(&dev->name_hlist);
+	netdev_name_node_del(dev->name_node);
 	hlist_del_rcu(&dev->index_hlist);
 	write_unlock_bh(&dev_base_lock);
 
@@ -733,14 +794,10 @@ EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 
 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 {
-	struct net_device *dev;
-	struct hlist_head *head = dev_name_hash(net, name);
+	struct netdev_name_node *node_name;
 
-	hlist_for_each_entry(dev, head, name_hlist)
-		if (!strncmp(dev->name, name, IFNAMSIZ))
-			return dev;
-
-	return NULL;
+	node_name = netdev_name_node_lookup(net, name);
+	return node_name ? node_name->dev : NULL;
 }
 EXPORT_SYMBOL(__dev_get_by_name);
 
@@ -758,14 +815,10 @@ EXPORT_SYMBOL(__dev_get_by_name);
 
 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 {
-	struct net_device *dev;
-	struct hlist_head *head = dev_name_hash(net, name);
-
-	hlist_for_each_entry_rcu(dev, head, name_hlist)
-		if (!strncmp(dev->name, name, IFNAMSIZ))
-			return dev;
+	struct netdev_name_node *node_name;
 
-	return NULL;
+	node_name = netdev_name_node_lookup_rcu(net, name);
+	return node_name ? node_name->dev : NULL;
 }
 EXPORT_SYMBOL(dev_get_by_name_rcu);
 
@@ -1232,13 +1285,13 @@ rollback:
 	netdev_adjacent_rename_links(dev, oldname);
 
 	write_lock_bh(&dev_base_lock);
-	hlist_del_rcu(&dev->name_hlist);
+	netdev_name_node_del(dev->name_node);
 	write_unlock_bh(&dev_base_lock);
 
 	synchronize_rcu();
 
 	write_lock_bh(&dev_base_lock);
-	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+	netdev_name_node_add(net, dev->name_node);
 	write_unlock_bh(&dev_base_lock);
 
 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
@@ -8264,6 +8317,8 @@ static void rollback_registered_many(struct list_head *head)
 		dev_uc_flush(dev);
 		dev_mc_flush(dev);
 
+		netdev_name_node_free(dev->name_node);
+
 		if (dev->netdev_ops->ndo_uninit)
 			dev->netdev_ops->ndo_uninit(dev);
 
@@ -8706,6 +8761,10 @@ int register_netdevice(struct net_device *dev)
 	if (ret < 0)
 		goto out;
 
+	dev->name_node = netdev_name_node_head_alloc(dev);
+	if (!dev->name_node)
+		goto out;
+
 	/* Init, if this function is available */
 	if (dev->netdev_ops->ndo_init) {
 		ret = dev->netdev_ops->ndo_init(dev);
@@ -8827,6 +8886,8 @@ out:
 	return ret;
 
 err_uninit:
+	if (dev->name_node)
+		netdev_name_node_free(dev->name_node);
 	if (dev->netdev_ops->ndo_uninit)
 		dev->netdev_ops->ndo_uninit(dev);
 	if (dev->priv_destructor)
-- 
cgit v1.2.3-59-g8ed1b


From 36fbf1e52bd3ff8a5cb604955eedfc9350c2e6cc Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Mon, 30 Sep 2019 11:48:16 +0200
Subject: net: rtnetlink: add linkprop commands to add and delete alternative
 ifnames

Add two commands to add and delete list of link properties. Implement
the first property type along - alternative ifnames.
Each net device can have multiple alternative names.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h      |   4 ++
 include/uapi/linux/if.h        |   1 +
 include/uapi/linux/if_link.h   |   2 +
 include/uapi/linux/rtnetlink.h |   7 +++
 net/core/dev.c                 |  58 ++++++++++++++++++++++-
 net/core/rtnetlink.c           | 103 +++++++++++++++++++++++++++++++++++++++++
 security/selinux/nlmsgtab.c    |   4 +-
 7 files changed, 177 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e92bc5467256..48cc71aae466 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -927,10 +927,14 @@ struct tlsdev_ops;
 
 struct netdev_name_node {
 	struct hlist_node hlist;
+	struct list_head list;
 	struct net_device *dev;
 	const char *name;
 };
 
+int netdev_name_node_alt_create(struct net_device *dev, const char *name);
+int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
index 7fea0fd7d6f5..4bf33344aab1 100644
--- a/include/uapi/linux/if.h
+++ b/include/uapi/linux/if.h
@@ -33,6 +33,7 @@
 #define	IFNAMSIZ	16
 #endif /* __UAPI_DEF_IF_IFNAMSIZ */
 #define	IFALIASZ	256
+#define	ALTIFNAMSIZ	128
 #include <linux/hdlc/ioctl.h>
 
 /* For glibc compatibility. An empty enum does not compile. */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 4a8c02cafa9a..8aec8769d944 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -167,6 +167,8 @@ enum {
 	IFLA_NEW_IFINDEX,
 	IFLA_MIN_MTU,
 	IFLA_MAX_MTU,
+	IFLA_PROP_LIST,
+	IFLA_ALT_IFNAME, /* Alternative ifname */
 	__IFLA_MAX
 };
 
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index ce2a623abb75..1418a8362bb7 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -164,6 +164,13 @@ enum {
 	RTM_GETNEXTHOP,
 #define RTM_GETNEXTHOP	RTM_GETNEXTHOP
 
+	RTM_NEWLINKPROP = 108,
+#define RTM_NEWLINKPROP	RTM_NEWLINKPROP
+	RTM_DELLINKPROP,
+#define RTM_DELLINKPROP	RTM_DELLINKPROP
+	RTM_GETLINKPROP,
+#define RTM_GETLINKPROP	RTM_GETLINKPROP
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
diff --git a/net/core/dev.c b/net/core/dev.c
index d2053d07c94a..7a456c6a7ad8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -245,7 +245,13 @@ static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
 static struct netdev_name_node *
 netdev_name_node_head_alloc(struct net_device *dev)
 {
-	return netdev_name_node_alloc(dev, dev->name);
+	struct netdev_name_node *name_node;
+
+	name_node = netdev_name_node_alloc(dev, dev->name);
+	if (!name_node)
+		return NULL;
+	INIT_LIST_HEAD(&name_node->list);
+	return name_node;
 }
 
 static void netdev_name_node_free(struct netdev_name_node *name_node)
@@ -289,6 +295,55 @@ static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
 	return NULL;
 }
 
+int netdev_name_node_alt_create(struct net_device *dev, const char *name)
+{
+	struct netdev_name_node *name_node;
+	struct net *net = dev_net(dev);
+
+	name_node = netdev_name_node_lookup(net, name);
+	if (name_node)
+		return -EEXIST;
+	name_node = netdev_name_node_alloc(dev, name);
+	if (!name_node)
+		return -ENOMEM;
+	netdev_name_node_add(net, name_node);
+	/* The node that holds dev->name acts as a head of per-device list. */
+	list_add_tail(&name_node->list, &dev->name_node->list);
+
+	return 0;
+}
+EXPORT_SYMBOL(netdev_name_node_alt_create);
+
+static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
+{
+	list_del(&name_node->list);
+	netdev_name_node_del(name_node);
+	kfree(name_node->name);
+	netdev_name_node_free(name_node);
+}
+
+int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
+{
+	struct netdev_name_node *name_node;
+	struct net *net = dev_net(dev);
+
+	name_node = netdev_name_node_lookup(net, name);
+	if (!name_node)
+		return -ENOENT;
+	__netdev_name_node_alt_destroy(name_node);
+
+	return 0;
+}
+EXPORT_SYMBOL(netdev_name_node_alt_destroy);
+
+static void netdev_name_node_alt_flush(struct net_device *dev)
+{
+	struct netdev_name_node *name_node, *tmp;
+
+	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
+		__netdev_name_node_alt_destroy(name_node);
+}
+
 /* Device list insertion */
 static void list_netdevice(struct net_device *dev)
 {
@@ -8317,6 +8372,7 @@ static void rollback_registered_many(struct list_head *head)
 		dev_uc_flush(dev);
 		dev_mc_flush(dev);
 
+		netdev_name_node_alt_flush(dev);
 		netdev_name_node_free(dev->name_node);
 
 		if (dev->netdev_ops->ndo_uninit)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1ee6460f8275..e13646993d82 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1750,6 +1750,9 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
 	[IFLA_MIN_MTU]		= { .type = NLA_U32 },
 	[IFLA_MAX_MTU]		= { .type = NLA_U32 },
+	[IFLA_PROP_LIST]	= { .type = NLA_NESTED },
+	[IFLA_ALT_IFNAME]	= { .type = NLA_STRING,
+				    .len = ALTIFNAMSIZ - 1 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -3373,6 +3376,103 @@ out:
 	return err;
 }
 
+static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr,
+			   bool *changed, struct netlink_ext_ack *extack)
+{
+	char *alt_ifname;
+	int err;
+
+	err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack);
+	if (err)
+		return err;
+
+	alt_ifname = nla_data(attr);
+	if (cmd == RTM_NEWLINKPROP) {
+		alt_ifname = kstrdup(alt_ifname, GFP_KERNEL);
+		if (!alt_ifname)
+			return -ENOMEM;
+		err = netdev_name_node_alt_create(dev, alt_ifname);
+		if (err) {
+			kfree(alt_ifname);
+			return err;
+		}
+	} else if (cmd == RTM_DELLINKPROP) {
+		err = netdev_name_node_alt_destroy(dev, alt_ifname);
+		if (err)
+			return err;
+	} else {
+		WARN_ON(1);
+		return 0;
+	}
+
+	*changed = true;
+	return 0;
+}
+
+static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh,
+			 struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFLA_MAX + 1];
+	struct net_device *dev;
+	struct ifinfomsg *ifm;
+	bool changed = false;
+	struct nlattr *attr;
+	int err, rem;
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
+	if (err)
+		return err;
+
+	err = rtnl_ensure_unique_netns(tb, extack, true);
+	if (err)
+		return err;
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifi_index > 0) {
+		dev = __dev_get_by_index(net, ifm->ifi_index);
+	} else if (tb[IFLA_IFNAME]) {
+		char ifname[IFNAMSIZ];
+
+		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, ifname);
+	} else {
+		return -EINVAL;
+	}
+
+	if (!dev)
+		return -ENODEV;
+
+	if (!tb[IFLA_PROP_LIST])
+		return 0;
+
+	nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) {
+		switch (nla_type(attr)) {
+		case IFLA_ALT_IFNAME:
+			err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack);
+			if (err)
+				return err;
+			break;
+		}
+	}
+
+	if (changed)
+		netdev_state_change(dev);
+	return 0;
+}
+
+static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
+			    struct netlink_ext_ack *extack)
+{
+	return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack);
+}
+
+static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
+			    struct netlink_ext_ack *extack)
+{
+	return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack);
+}
+
 static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
@@ -5331,6 +5431,9 @@ void __init rtnetlink_init(void)
 	rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0);
 	rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0);
 
+	rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0);
+	rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0);
+
 	rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0);
 	rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0);
 	rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0);
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index 58345ba0528e..c97fdae8f71b 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -83,6 +83,8 @@ static const struct nlmsg_perm nlmsg_route_perms[] =
 	{ RTM_NEWNEXTHOP,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
 	{ RTM_DELNEXTHOP,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
 	{ RTM_GETNEXTHOP,	NETLINK_ROUTE_SOCKET__NLMSG_READ  },
+	{ RTM_NEWLINKPROP,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+	{ RTM_DELLINKPROP,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
 };
 
 static const struct nlmsg_perm nlmsg_tcpdiag_perms[] =
@@ -166,7 +168,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm)
 		 * structures at the top of this file with the new mappings
 		 * before updating the BUILD_BUG_ON() macro!
 		 */
-		BUILD_BUG_ON(RTM_MAX != (RTM_NEWNEXTHOP + 3));
+		BUILD_BUG_ON(RTM_MAX != (RTM_NEWLINKPROP + 3));
 		err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms,
 				 sizeof(nlmsg_route_perms));
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From afa0df5998131153ec3036f41e76ece33bf1334f Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Mon, 30 Sep 2019 10:15:09 +0200
Subject: net: push loops and nb calls into helper functions

Push iterations over net namespaces and netdevices from
register_netdevice_notifier() and unregister_netdevice_notifier()
into helper functions. Along with that introduce continue_reverse macros
to make the code a bit nicer allowing to get rid of "last" marks.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h   |  3 ++
 include/net/net_namespace.h |  3 +-
 net/core/dev.c              | 89 +++++++++++++++++++++++++++++++--------------
 3 files changed, 66 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 48cc71aae466..7b183f724fc4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2574,6 +2574,9 @@ extern rwlock_t				dev_base_lock;		/* Device list lock */
 		list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list)
 #define for_each_netdev_continue(net, d)		\
 		list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list)
+#define for_each_netdev_continue_reverse(net, d)		\
+		list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \
+						     dev_list)
 #define for_each_netdev_continue_rcu(net, d)		\
 	list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list)
 #define for_each_netdev_in_bond_rcu(bond, slave)	\
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index f8712bbeb2e0..c5a98e03591d 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -317,7 +317,8 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
 /* Protected by net_rwsem */
 #define for_each_net(VAR)				\
 	list_for_each_entry(VAR, &net_namespace_list, list)
-
+#define for_each_net_continue_reverse(VAR)		\
+	list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list)
 #define for_each_net_rcu(VAR)				\
 	list_for_each_entry_rcu(VAR, &net_namespace_list, list)
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 7a456c6a7ad8..a8b70cb6c732 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1725,6 +1725,62 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 	return nb->notifier_call(nb, val, &info);
 }
 
+static int call_netdevice_register_notifiers(struct notifier_block *nb,
+					     struct net_device *dev)
+{
+	int err;
+
+	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
+	err = notifier_to_errno(err);
+	if (err)
+		return err;
+
+	if (!(dev->flags & IFF_UP))
+		return 0;
+
+	call_netdevice_notifier(nb, NETDEV_UP, dev);
+	return 0;
+}
+
+static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
+						struct net_device *dev)
+{
+	if (dev->flags & IFF_UP) {
+		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+					dev);
+		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
+	}
+	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
+}
+
+static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
+						 struct net *net)
+{
+	struct net_device *dev;
+	int err;
+
+	for_each_netdev(net, dev) {
+		err = call_netdevice_register_notifiers(nb, dev);
+		if (err)
+			goto rollback;
+	}
+	return 0;
+
+rollback:
+	for_each_netdev_continue_reverse(net, dev)
+		call_netdevice_unregister_notifiers(nb, dev);
+	return err;
+}
+
+static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
+						    struct net *net)
+{
+	struct net_device *dev;
+
+	for_each_netdev(net, dev)
+		call_netdevice_unregister_notifiers(nb, dev);
+}
+
 static int dev_boot_phase = 1;
 
 /**
@@ -1743,8 +1799,6 @@ static int dev_boot_phase = 1;
 
 int register_netdevice_notifier(struct notifier_block *nb)
 {
-	struct net_device *dev;
-	struct net_device *last;
 	struct net *net;
 	int err;
 
@@ -1757,17 +1811,9 @@ int register_netdevice_notifier(struct notifier_block *nb)
 	if (dev_boot_phase)
 		goto unlock;
 	for_each_net(net) {
-		for_each_netdev(net, dev) {
-			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
-			err = notifier_to_errno(err);
-			if (err)
-				goto rollback;
-
-			if (!(dev->flags & IFF_UP))
-				continue;
-
-			call_netdevice_notifier(nb, NETDEV_UP, dev);
-		}
+		err = call_netdevice_register_net_notifiers(nb, net);
+		if (err)
+			goto rollback;
 	}
 
 unlock:
@@ -1776,22 +1822,9 @@ unlock:
 	return err;
 
 rollback:
-	last = dev;
-	for_each_net(net) {
-		for_each_netdev(net, dev) {
-			if (dev == last)
-				goto outroll;
-
-			if (dev->flags & IFF_UP) {
-				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
-							dev);
-				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
-			}
-			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
-		}
-	}
+	for_each_net_continue_reverse(net)
+		call_netdevice_unregister_net_notifiers(nb, net);
 
-outroll:
 	raw_notifier_chain_unregister(&netdev_chain, nb);
 	goto unlock;
 }
-- 
cgit v1.2.3-59-g8ed1b


From a30c7b429f2dd980202c912fcb76442364937b4d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Mon, 30 Sep 2019 10:15:10 +0200
Subject: net: introduce per-netns netdevice notifiers

Often the code for example in drivers is interested in getting notifier
call only from certain network namespace. In addition to the existing
global netdevice notifier chain introduce per-netns chains and allow
users to register to that. Eventually this would eliminate unnecessary
overhead in case there are many netdevices in many network namespaces.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h   |  3 ++
 include/net/net_namespace.h |  3 ++
 net/core/dev.c              | 87 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7b183f724fc4..fe45b2c72315 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2504,6 +2504,9 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd);
 
 int register_netdevice_notifier(struct notifier_block *nb);
 int unregister_netdevice_notifier(struct notifier_block *nb);
+int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb);
+int unregister_netdevice_notifier_net(struct net *net,
+				      struct notifier_block *nb);
 
 struct netdev_notifier_info {
 	struct net_device	*dev;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index c5a98e03591d..5ac2bb16d4b3 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -36,6 +36,7 @@
 #include <linux/ns_common.h>
 #include <linux/idr.h>
 #include <linux/skbuff.h>
+#include <linux/notifier.h>
 
 struct user_namespace;
 struct proc_dir_entry;
@@ -96,6 +97,8 @@ struct net {
 	struct list_head 	dev_base_head;
 	struct hlist_head 	*dev_name_head;
 	struct hlist_head	*dev_index_head;
+	struct raw_notifier_head	netdev_chain;
+
 	unsigned int		dev_base_seq;	/* protected by rtnl_mutex */
 	int			ifindex;
 	unsigned int		dev_unreg_count;
diff --git a/net/core/dev.c b/net/core/dev.c
index a8b70cb6c732..c680225e0da8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1874,6 +1874,80 @@ unlock:
 }
 EXPORT_SYMBOL(unregister_netdevice_notifier);
 
+/**
+ * register_netdevice_notifier_net - register a per-netns network notifier block
+ * @net: network namespace
+ * @nb: notifier
+ *
+ * Register a notifier to be called when network device events occur.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
+ *
+ * When registered all registration and up events are replayed
+ * to the new notifier to allow device to have a race free
+ * view of the network device list.
+ */
+
+int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
+{
+	int err;
+
+	rtnl_lock();
+	err = raw_notifier_chain_register(&net->netdev_chain, nb);
+	if (err)
+		goto unlock;
+	if (dev_boot_phase)
+		goto unlock;
+
+	err = call_netdevice_register_net_notifiers(nb, net);
+	if (err)
+		goto chain_unregister;
+
+unlock:
+	rtnl_unlock();
+	return err;
+
+chain_unregister:
+	raw_notifier_chain_unregister(&netdev_chain, nb);
+	goto unlock;
+}
+EXPORT_SYMBOL(register_netdevice_notifier_net);
+
+/**
+ * unregister_netdevice_notifier_net - unregister a per-netns
+ *                                     network notifier block
+ * @net: network namespace
+ * @nb: notifier
+ *
+ * Unregister a notifier previously registered by
+ * register_netdevice_notifier(). The notifier is unlinked into the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
+ *
+ * After unregistering unregister and down device events are synthesized
+ * for all devices on the device list to the removed notifier to remove
+ * the need for special case cleanup code.
+ */
+
+int unregister_netdevice_notifier_net(struct net *net,
+				      struct notifier_block *nb)
+{
+	int err;
+
+	rtnl_lock();
+	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
+	if (err)
+		goto unlock;
+
+	call_netdevice_unregister_net_notifiers(nb, net);
+
+unlock:
+	rtnl_unlock();
+	return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier_net);
+
 /**
  *	call_netdevice_notifiers_info - call all network notifier blocks
  *	@val: value passed unmodified to notifier function
@@ -1886,7 +1960,18 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
 static int call_netdevice_notifiers_info(unsigned long val,
 					 struct netdev_notifier_info *info)
 {
+	struct net *net = dev_net(info->dev);
+	int ret;
+
 	ASSERT_RTNL();
+
+	/* Run per-netns notifier block chain first, then run the global one.
+	 * Hopefully, one day, the global one is going to be removed after
+	 * all notifier block registrators get converted to be per-netns.
+	 */
+	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
+	if (ret & NOTIFY_STOP_MASK)
+		return ret;
 	return raw_notifier_call_chain(&netdev_chain, val, info);
 }
 
@@ -9785,6 +9870,8 @@ static int __net_init netdev_init(struct net *net)
 	if (net->dev_index_head == NULL)
 		goto err_idx;
 
+	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
+
 	return 0;
 
 err_idx:
-- 
cgit v1.2.3-59-g8ed1b


From 37048e94a2dc81a5a259963117f62341e25161f7 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Tue, 1 Oct 2019 22:12:50 +0300
Subject: net: dsa: Remove unused __DSA_SKB_CB macro

The struct __dsa_skb_cb is supposed to span the entire 48-byte skb
control block, while the struct dsa_skb_cb only the portion of it which
is used by the DSA core (the rest is available as private data to
drivers).

The DSA_SKB_CB and __DSA_SKB_CB helpers are supposed to help retrieve
this pointer based on a skb, but it turns out there is nobody directly
interested in the struct __dsa_skb_cb in the kernel. So remove it.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 541fb514e31d..8c3ea0530f65 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -94,8 +94,6 @@ struct __dsa_skb_cb {
 	u8 priv[48 - sizeof(struct dsa_skb_cb)];
 };
 
-#define __DSA_SKB_CB(skb) ((struct __dsa_skb_cb *)((skb)->cb))
-
 #define DSA_SKB_CB(skb) ((struct dsa_skb_cb *)((skb)->cb))
 
 #define DSA_SKB_CB_PRIV(skb)			\
-- 
cgit v1.2.3-59-g8ed1b


From 968a2978cb39a754750d35a47049781660682a31 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 2 Oct 2019 16:52:57 +0200
Subject: net: stmmac: Only enable enhanced addressing mode when needed

Enhanced addressing mode is only required when more than 32 bits need to
be addressed. Add a DMA configuration parameter to enable this mode only
when needed.

Signed-off-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c | 5 ++++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 7 +++++++
 include/linux/stmmac.h                             | 1 +
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
index 53c4a40d8386..19ef037393dd 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_dma.c
@@ -27,7 +27,10 @@ static void dwxgmac2_dma_init(void __iomem *ioaddr,
 	if (dma_cfg->aal)
 		value |= XGMAC_AAL;
 
-	writel(value | XGMAC_EAME, ioaddr + XGMAC_DMA_SYSBUS_MODE);
+	if (dma_cfg->eame)
+		value |= XGMAC_EAME;
+
+	writel(value, ioaddr + XGMAC_DMA_SYSBUS_MODE);
 }
 
 static void dwxgmac2_dma_init_chan(void __iomem *ioaddr,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index d3232738fb25..2da1f77bb604 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -4514,6 +4514,13 @@ int stmmac_dvr_probe(struct device *device,
 		if (!ret) {
 			dev_info(priv->device, "Using %d bits DMA width\n",
 				 priv->dma_cap.addr64);
+
+			/*
+			 * If more than 32 bits can be addressed, make sure to
+			 * enable enhanced addressing mode.
+			 */
+			if (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT))
+				priv->plat->dma_cfg->eame = true;
 		} else {
 			ret = dma_set_mask_and_coherent(device, DMA_BIT_MASK(32));
 			if (ret) {
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index dc60d03c4b60..86f9464c3f5d 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -92,6 +92,7 @@ struct stmmac_dma_cfg {
 	int fixed_burst;
 	int mixed_burst;
 	bool aal;
+	bool eame;
 };
 
 #define AXI_BLEN	7
-- 
cgit v1.2.3-59-g8ed1b


From 4b76f9ed47074990d85c2ec52288a7d09c3ea357 Mon Sep 17 00:00:00 2001
From: Sunil Dutt <usdutt@codeaurora.org>
Date: Fri, 13 Sep 2019 18:11:44 +0530
Subject: nl80211: Document the expectation for NL80211_ATTR_IE in
 NL80211_CMD_CONNECT

This commit documents the expectation for NL80211_ATTR_IE when included
in NL80211_CMD_CONNECT, as following.

Driver shall not modify the IEs specified through NL80211_ATTR_IE if
NL80211_ATTR_MAC is included. However, if NL80211_ATTR_MAC_HINT is
included, these IEs through NL80211_ATTR_IE are specified by the user
space based on the best possible BSS selected. Thus, if the driver ends
up selecting a different BSS, it can modify these IEs accordingly (e.g.
userspace asks the driver to perform PMKSA caching with BSS1 and the
driver ends up selecting BSS2 with different PMKSA cache entry. RSNIE
has to get updated with the apt PMKID).

Signed-off-by: Sunil Dutt <usdutt@codeaurora.org>
Link: https://lore.kernel.org/r/1568378504-15179-1-git-send-email-usdutt@codeaurora.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index beee59c831a7..64135ab3a7ac 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -571,6 +571,14 @@
  *	set of BSSID,frequency parameters is used (i.e., either the enforcing
  *	%NL80211_ATTR_MAC,%NL80211_ATTR_WIPHY_FREQ or the less strict
  *	%NL80211_ATTR_MAC_HINT and %NL80211_ATTR_WIPHY_FREQ_HINT).
+ *	Driver shall not modify the IEs specified through %NL80211_ATTR_IE if
+ *	%NL80211_ATTR_MAC is included. However, if %NL80211_ATTR_MAC_HINT is
+ *	included, these IEs through %NL80211_ATTR_IE are specified by the user
+ *	space based on the best possible BSS selected. Thus, if the driver ends
+ *	up selecting a different BSS, it can modify these IEs accordingly (e.g.
+ *	userspace asks the driver to perform PMKSA caching with BSS1 and the
+ *	driver ends up selecting BSS2 with different PMKSA cache entry; RSNIE
+ *	has to get updated with the apt PMKID).
  *	%NL80211_ATTR_PREV_BSSID can be used to request a reassociation within
  *	the ESS in case the device is already associated and an association with
  *	a different BSS is desired.
-- 
cgit v1.2.3-59-g8ed1b


From 2ce113de31320756b25179f3f4512a522bc45263 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 2 Oct 2019 11:12:25 +0200
Subject: mac80211: simplify TX aggregation start

There really is no need to make drivers call the
ieee80211_start_tx_ba_cb_irqsafe() function and then
schedule the worker if all we want is to set a bit.

Add a new return value (that was previously considered
invalid) to indicate that the driver is immediately
ready for the session, and make drivers use it. The
only drivers that remain different are the Intel ones
as they need to negotiate more with the firmware.

Link: https://lore.kernel.org/r/1570007543-I152912660131cbab2e5d80b4218238c20f8a06e5@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath9k/htc_drv_main.c                 |  2 +-
 drivers/net/wireless/ath/ath9k/main.c                         |  2 +-
 drivers/net/wireless/ath/carl9170/main.c                      |  3 +--
 drivers/net/wireless/ath/wcn36xx/main.c                       |  5 +++--
 .../net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c    |  3 +--
 drivers/net/wireless/intel/iwlegacy/4965-mac.c                |  2 +-
 drivers/net/wireless/intel/iwlwifi/dvm/tx.c                   |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/sta.c                  |  5 ++---
 drivers/net/wireless/mac80211_hwsim.c                         |  3 +--
 drivers/net/wireless/marvell/mwl8k.c                          |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7603/main.c              |  3 +--
 drivers/net/wireless/mediatek/mt76/mt7615/main.c              |  3 +--
 drivers/net/wireless/mediatek/mt76/mt76x02_util.c             |  3 +--
 drivers/net/wireless/mediatek/mt7601u/main.c                  |  3 +--
 drivers/net/wireless/ralink/rt2x00/rt2800lib.c                |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/base.c                   |  3 +--
 drivers/net/wireless/realtek/rtw88/mac80211.c                 |  3 +--
 drivers/net/wireless/rsi/rsi_91x_mac80211.c                   |  3 +--
 include/net/mac80211.h                                        | 11 +++++++++--
 net/mac80211/agg-tx.c                                         |  9 ++++++++-
 20 files changed, 39 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
index a82ad739ab80..791f6633667c 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
@@ -1674,7 +1674,7 @@ static int ath9k_htc_ampdu_action(struct ieee80211_hw *hw,
 	case IEEE80211_AMPDU_TX_START:
 		ret = ath9k_htc_tx_aggr_oper(priv, vif, sta, action, tid);
 		if (!ret)
-			ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
+			ret = IEEE80211_AMPDU_TX_START_IMMEDIATE;
 		break;
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 	case IEEE80211_AMPDU_TX_STOP_FLUSH:
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index 34121fbf32e3..0548aa3702e3 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1921,7 +1921,7 @@ static int ath9k_ampdu_action(struct ieee80211_hw *hw,
 		ath9k_ps_wakeup(sc);
 		ret = ath_tx_aggr_start(sc, sta, tid, ssn);
 		if (!ret)
-			ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
+			ret = IEEE80211_AMPDU_TX_START_IMMEDIATE;
 		ath9k_ps_restore(sc);
 		break;
 	case IEEE80211_AMPDU_TX_STOP_FLUSH:
diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c
index 40a8054f8aa6..5914926a5c5b 100644
--- a/drivers/net/wireless/ath/carl9170/main.c
+++ b/drivers/net/wireless/ath/carl9170/main.c
@@ -1449,8 +1449,7 @@ static int carl9170_op_ampdu_action(struct ieee80211_hw *hw,
 		rcu_assign_pointer(sta_info->agg[tid], tid_info);
 		spin_unlock_bh(&ar->tx_ampdu_list_lock);
 
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
-		break;
+		return IEEE80211_AMPDU_TX_START_IMMEDIATE;
 
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 	case IEEE80211_AMPDU_TX_STOP_FLUSH:
diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index 79998a3ddb7a..a276dae30887 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -1084,6 +1084,7 @@ static int wcn36xx_ampdu_action(struct ieee80211_hw *hw,
 	enum ieee80211_ampdu_mlme_action action = params->action;
 	u16 tid = params->tid;
 	u16 *ssn = &params->ssn;
+	int ret = 0;
 
 	wcn36xx_dbg(WCN36XX_DBG_MAC, "mac ampdu action action %d tid %d\n",
 		    action, tid);
@@ -1106,7 +1107,7 @@ static int wcn36xx_ampdu_action(struct ieee80211_hw *hw,
 		sta_priv->ampdu_state[tid] = WCN36XX_AMPDU_START;
 		spin_unlock_bh(&sta_priv->ampdu_lock);
 
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
+		ret = IEEE80211_AMPDU_TX_START_IMMEDIATE;
 		break;
 	case IEEE80211_AMPDU_TX_OPERATIONAL:
 		spin_lock_bh(&sta_priv->ampdu_lock);
@@ -1131,7 +1132,7 @@ static int wcn36xx_ampdu_action(struct ieee80211_hw *hw,
 
 	mutex_unlock(&wcn->conf_mutex);
 
-	return 0;
+	return ret;
 }
 
 static const struct ieee80211_ops wcn36xx_ops = {
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
index 6188275b17e5..8e8b685cfe09 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
@@ -850,8 +850,7 @@ brcms_ops_ampdu_action(struct ieee80211_hw *hw,
 				     "START: tid %d is not agg\'able\n", tid);
 			return -EINVAL;
 		}
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
-		break;
+		return IEEE80211_AMPDU_TX_START_IMMEDIATE;
 
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 	case IEEE80211_AMPDU_TX_STOP_FLUSH:
diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
index ffb705b18fb1..51fdd7ce30af 100644
--- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c
+++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
@@ -2265,7 +2265,7 @@ il4965_tx_agg_start(struct il_priv *il, struct ieee80211_vif *vif,
 	if (tid_data->tfds_in_queue == 0) {
 		D_HT("HW queue is empty\n");
 		tid_data->agg.state = IL_AGG_ON;
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
+		ret = IEEE80211_AMPDU_TX_START_IMMEDIATE;
 	} else {
 		D_HT("HW queue is NOT empty: %d packets in HW queue\n",
 		     tid_data->tfds_in_queue);
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/tx.c b/drivers/net/wireless/intel/iwlwifi/dvm/tx.c
index 3029e3f6de63..cd73fc5cfcbb 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/tx.c
@@ -621,7 +621,7 @@ int iwlagn_tx_agg_start(struct iwl_priv *priv, struct ieee80211_vif *vif,
 		IWL_DEBUG_TX_QUEUES(priv, "Can proceed: ssn = next_recl = %d\n",
 				    tid_data->agg.ssn);
 		tid_data->agg.state = IWL_AGG_STARTING;
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
+		ret = IEEE80211_AMPDU_TX_START_IMMEDIATE;
 	} else {
 		IWL_DEBUG_TX_QUEUES(priv, "Can't proceed: ssn %d, "
 				    "next_reclaimed = %d\n",
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
index 0bedba4c61f2..1d6bc62b104c 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
@@ -2818,13 +2818,12 @@ int iwl_mvm_sta_tx_agg_start(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 
 	if (normalized_ssn == tid_data->next_reclaimed) {
 		tid_data->state = IWL_AGG_STARTING;
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
+		ret = IEEE80211_AMPDU_TX_START_IMMEDIATE;
 	} else {
 		tid_data->state = IWL_EMPTYING_HW_QUEUE_ADDBA;
+		ret = 0;
 	}
 
-	ret = 0;
-
 out:
 	spin_unlock_bh(&mvmsta->lock);
 
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 635956024e88..1aeb38296ec3 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -1979,8 +1979,7 @@ static int mac80211_hwsim_ampdu_action(struct ieee80211_hw *hw,
 
 	switch (action) {
 	case IEEE80211_AMPDU_TX_START:
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
-		break;
+		return IEEE80211_AMPDU_TX_START_IMMEDIATE;
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 	case IEEE80211_AMPDU_TX_STOP_FLUSH:
 	case IEEE80211_AMPDU_TX_STOP_FLUSH_CONT:
diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c
index c4db6417748f..d55f229abeea 100644
--- a/drivers/net/wireless/marvell/mwl8k.c
+++ b/drivers/net/wireless/marvell/mwl8k.c
@@ -5520,7 +5520,7 @@ mwl8k_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			rc = -EBUSY;
 			break;
 		}
-		ieee80211_start_tx_ba_cb_irqsafe(vif, addr, tid);
+		rc = IEEE80211_AMPDU_TX_START_IMMEDIATE;
 		break;
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 	case IEEE80211_AMPDU_TX_STOP_FLUSH:
diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/main.c b/drivers/net/wireless/mediatek/mt76/mt7603/main.c
index 25d5b1608bc9..4b3217b43a04 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7603/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7603/main.c
@@ -582,8 +582,7 @@ mt7603_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		break;
 	case IEEE80211_AMPDU_TX_START:
 		mtxq->agg_ssn = IEEE80211_SN_TO_SEQ(ssn);
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
-		break;
+		return IEEE80211_AMPDU_TX_START_IMMEDIATE;
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 		mtxq->aggr = false;
 		mt7603_mac_tx_ba_reset(dev, msta->wcid.idx, tid, -1);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/main.c b/drivers/net/wireless/mediatek/mt76/mt7615/main.c
index 87c748715b5d..b6d78212306a 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/main.c
@@ -477,8 +477,7 @@ mt7615_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		break;
 	case IEEE80211_AMPDU_TX_START:
 		mtxq->agg_ssn = IEEE80211_SN_TO_SEQ(ssn);
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
-		break;
+		return IEEE80211_AMPDU_TX_START_IMMEDIATE;
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 		mtxq->aggr = false;
 		mt7615_mcu_set_tx_ba(dev, params, 0);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c
index aec73a0295e8..414b22399d93 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c
@@ -393,8 +393,7 @@ int mt76x02_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		break;
 	case IEEE80211_AMPDU_TX_START:
 		mtxq->agg_ssn = IEEE80211_SN_TO_SEQ(ssn);
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
-		break;
+		return IEEE80211_AMPDU_TX_START_IMMEDIATE;
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 		mtxq->aggr = false;
 		ieee80211_stop_tx_ba_cb_irqsafe(vif, sta->addr, tid);
diff --git a/drivers/net/wireless/mediatek/mt7601u/main.c b/drivers/net/wireless/mediatek/mt7601u/main.c
index 72e608cc53af..671d8897ae76 100644
--- a/drivers/net/wireless/mediatek/mt7601u/main.c
+++ b/drivers/net/wireless/mediatek/mt7601u/main.c
@@ -372,8 +372,7 @@ mt76_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		break;
 	case IEEE80211_AMPDU_TX_START:
 		msta->agg_ssn[tid] = ssn << 4;
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
-		break;
+		return IEEE80211_AMPDU_TX_START_IMMEDIATE;
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 		ieee80211_stop_tx_ba_cb_irqsafe(vif, sta->addr, tid);
 		break;
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
index f1cdcd61c54a..25466454b73e 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
@@ -10476,7 +10476,7 @@ int rt2800_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	 * when the hw reorders frames due to aggregation.
 	 */
 	if (sta_priv->wcid > WCID_END)
-		return 1;
+		return -ENOSPC;
 
 	switch (action) {
 	case IEEE80211_AMPDU_RX_START:
@@ -10489,7 +10489,7 @@ int rt2800_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		 */
 		break;
 	case IEEE80211_AMPDU_TX_START:
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
+		ret = IEEE80211_AMPDU_TX_START_IMMEDIATE;
 		break;
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 	case IEEE80211_AMPDU_TX_STOP_FLUSH:
diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c
index ac746c322554..c75192c4447f 100644
--- a/drivers/net/wireless/realtek/rtlwifi/base.c
+++ b/drivers/net/wireless/realtek/rtlwifi/base.c
@@ -1776,8 +1776,7 @@ int rtl_tx_agg_start(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 
 	tid_data->agg.agg_state = RTL_AGG_START;
 
-	ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
-	return 0;
+	return IEEE80211_AMPDU_TX_START_IMMEDIATE;
 }
 
 int rtl_tx_agg_stop(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c
index e5e3605bb693..a203b4705b94 100644
--- a/drivers/net/wireless/realtek/rtw88/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw88/mac80211.c
@@ -437,8 +437,7 @@ static int rtw_ops_ampdu_action(struct ieee80211_hw *hw,
 
 	switch (params->action) {
 	case IEEE80211_AMPDU_TX_START:
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
-		break;
+		return IEEE80211_AMPDU_TX_START_IMMEDIATE;
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 	case IEEE80211_AMPDU_TX_STOP_FLUSH:
 	case IEEE80211_AMPDU_TX_STOP_FLUSH_CONT:
diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index ce5e92d82efc..440088293aff 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -1140,8 +1140,7 @@ static int rsi_mac80211_ampdu_action(struct ieee80211_hw *hw,
 		else if ((vif->type == NL80211_IFTYPE_AP) ||
 			 (vif->type == NL80211_IFTYPE_P2P_GO))
 			rsta->seq_start[tid] = seq_no;
-		ieee80211_start_tx_ba_cb_irqsafe(vif, sta->addr, tid);
-		status = 0;
+		status = IEEE80211_AMPDU_TX_START_IMMEDIATE;
 		break;
 
 	case IEEE80211_AMPDU_TX_STOP_CONT:
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 523c6a09e1c8..d69081c38788 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3095,7 +3095,9 @@ enum ieee80211_filter_flags {
  *
  * @IEEE80211_AMPDU_RX_START: start RX aggregation
  * @IEEE80211_AMPDU_RX_STOP: stop RX aggregation
- * @IEEE80211_AMPDU_TX_START: start TX aggregation
+ * @IEEE80211_AMPDU_TX_START: start TX aggregation, the driver must either
+ *	call ieee80211_start_tx_ba_cb_irqsafe() or return the special
+ *	status %IEEE80211_AMPDU_TX_START_IMMEDIATE.
  * @IEEE80211_AMPDU_TX_OPERATIONAL: TX aggregation has become operational
  * @IEEE80211_AMPDU_TX_STOP_CONT: stop TX aggregation but continue transmitting
  *	queued packets, now unaggregated. After all packets are transmitted the
@@ -3119,6 +3121,8 @@ enum ieee80211_ampdu_mlme_action {
 	IEEE80211_AMPDU_TX_OPERATIONAL,
 };
 
+#define IEEE80211_AMPDU_TX_START_IMMEDIATE 1
+
 /**
  * struct ieee80211_ampdu_params - AMPDU action parameters
  *
@@ -3896,7 +3900,10 @@ struct ieee80211_ops {
 	 *
 	 * Even ``189`` would be wrong since 1 could be lost again.
 	 *
-	 * Returns a negative error code on failure.
+	 * Returns a negative error code on failure. The driver may return
+	 * %IEEE80211_AMPDU_TX_START_IMMEDIATE for %IEEE80211_AMPDU_TX_START
+	 * if the session can start immediately.
+	 *
 	 * The callback can sleep.
 	 */
 	int (*ampdu_action)(struct ieee80211_hw *hw,
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index b11883d26875..33da6f738c99 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -485,7 +485,14 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
 
 	params.ssn = sta->tid_seq[tid] >> 4;
 	ret = drv_ampdu_action(local, sdata, &params);
-	if (ret) {
+	if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) {
+		/*
+		 * We didn't send the request yet, so don't need to check
+		 * here if we already got a response, just mark as driver
+		 * ready immediately.
+		 */
+		set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state);
+	} else if (ret) {
 		ht_dbg(sdata,
 		       "BA request denied - HW unavailable for %pM tid %d\n",
 		       sta->sta.addr, tid);
-- 
cgit v1.2.3-59-g8ed1b


From 7c550daffe22a97282effa75fe7c1f6b83563ecb Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 3 Oct 2019 11:49:27 +0200
Subject: net: fib_notifier: make FIB notifier per-netns

Currently all users of FIB notifier only cares about events in init_net.
Later in this patchset, users get interested in other namespaces too.
However, for every registered block user is interested only about one
namespace. Make the FIB notifier registration per-netns and avoid
unnecessary calls of notifier block for other namespaces.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c   |  7 +-
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  |  9 +--
 drivers/net/ethernet/rocker/rocker_main.c          |  9 +--
 drivers/net/netdevsim/fib.c                        |  8 +-
 include/linux/mroute_base.h                        | 10 +--
 include/net/fib_notifier.h                         |  7 +-
 include/net/ip6_fib.h                              |  2 +-
 include/net/ip_fib.h                               |  2 +-
 net/core/fib_notifier.c                            | 87 ++++++++++------------
 net/core/fib_rules.c                               |  7 +-
 net/ipv4/fib_notifier.c                            |  4 +-
 net/ipv4/fib_trie.c                                | 17 ++---
 net/ipv4/ipmr_base.c                               |  4 +-
 net/ipv6/fib6_notifier.c                           |  4 +-
 net/ipv6/ip6_fib.c                                 |  6 +-
 15 files changed, 78 insertions(+), 105 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
index 5d20d615663e..fe0cc969cf94 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
@@ -248,9 +248,6 @@ static int mlx5_lag_fib_event(struct notifier_block *nb,
 	struct net_device *fib_dev;
 	struct fib_info *fi;
 
-	if (!net_eq(info->net, &init_net))
-		return NOTIFY_DONE;
-
 	if (info->family != AF_INET)
 		return NOTIFY_DONE;
 
@@ -311,7 +308,7 @@ int mlx5_lag_mp_init(struct mlx5_lag *ldev)
 		return 0;
 
 	mp->fib_nb.notifier_call = mlx5_lag_fib_event;
-	err = register_fib_notifier(&mp->fib_nb,
+	err = register_fib_notifier(&init_net, &mp->fib_nb,
 				    mlx5_lag_fib_event_flush);
 	if (err)
 		mp->fib_nb.notifier_call = NULL;
@@ -326,6 +323,6 @@ void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev)
 	if (!mp->fib_nb.notifier_call)
 		return;
 
-	unregister_fib_notifier(&mp->fib_nb);
+	unregister_fib_notifier(&init_net, &mp->fib_nb);
 	mp->fib_nb.notifier_call = NULL;
 }
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index a330b369e899..d0db9ea71323 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -6213,7 +6213,7 @@ static int mlxsw_sp_router_fib_rule_event(unsigned long event,
 	rule = fr_info->rule;
 
 	/* Rule only affects locally generated traffic */
-	if (rule->iifindex == info->net->loopback_dev->ifindex)
+	if (rule->iifindex == init_net.loopback_dev->ifindex)
 		return 0;
 
 	switch (info->family) {
@@ -6250,8 +6250,7 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
 	struct mlxsw_sp_router *router;
 	int err;
 
-	if (!net_eq(info->net, &init_net) ||
-	    (info->family != AF_INET && info->family != AF_INET6 &&
+	if ((info->family != AF_INET && info->family != AF_INET6 &&
 	     info->family != RTNL_FAMILY_IPMR &&
 	     info->family != RTNL_FAMILY_IP6MR))
 		return NOTIFY_DONE;
@@ -8155,7 +8154,7 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
 		goto err_dscp_init;
 
 	mlxsw_sp->router->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
-	err = register_fib_notifier(&mlxsw_sp->router->fib_nb,
+	err = register_fib_notifier(&init_net, &mlxsw_sp->router->fib_nb,
 				    mlxsw_sp_router_fib_dump_flush);
 	if (err)
 		goto err_register_fib_notifier;
@@ -8195,7 +8194,7 @@ err_register_inetaddr_notifier:
 
 void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp)
 {
-	unregister_fib_notifier(&mlxsw_sp->router->fib_nb);
+	unregister_fib_notifier(&init_net, &mlxsw_sp->router->fib_nb);
 	unregister_netevent_notifier(&mlxsw_sp->router->netevent_nb);
 	mlxsw_sp_neigh_fini(mlxsw_sp);
 	mlxsw_sp_vrs_fini(mlxsw_sp);
diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index 786b158bd305..e54f6341a785 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -2189,9 +2189,6 @@ static int rocker_router_fib_event(struct notifier_block *nb,
 	struct rocker_fib_event_work *fib_work;
 	struct fib_notifier_info *info = ptr;
 
-	if (!net_eq(info->net, &init_net))
-		return NOTIFY_DONE;
-
 	if (info->family != AF_INET)
 		return NOTIFY_DONE;
 
@@ -2994,7 +2991,7 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	 * the device, so no need to pass a callback.
 	 */
 	rocker->fib_nb.notifier_call = rocker_router_fib_event;
-	err = register_fib_notifier(&rocker->fib_nb, NULL);
+	err = register_fib_notifier(&init_net, &rocker->fib_nb, NULL);
 	if (err)
 		goto err_register_fib_notifier;
 
@@ -3021,7 +3018,7 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 err_register_switchdev_blocking_notifier:
 	unregister_switchdev_notifier(&rocker_switchdev_notifier);
 err_register_switchdev_notifier:
-	unregister_fib_notifier(&rocker->fib_nb);
+	unregister_fib_notifier(&init_net, &rocker->fib_nb);
 err_register_fib_notifier:
 	rocker_remove_ports(rocker);
 err_probe_ports:
@@ -3057,7 +3054,7 @@ static void rocker_remove(struct pci_dev *pdev)
 	unregister_switchdev_blocking_notifier(nb);
 
 	unregister_switchdev_notifier(&rocker_switchdev_notifier);
-	unregister_fib_notifier(&rocker->fib_nb);
+	unregister_fib_notifier(&init_net, &rocker->fib_nb);
 	rocker_remove_ports(rocker);
 	rocker_write32(rocker, CONTROL, ROCKER_CONTROL_RESET);
 	destroy_workqueue(rocker->rocker_owq);
diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c
index 7de17e42d77a..01ee9cc54605 100644
--- a/drivers/net/netdevsim/fib.c
+++ b/drivers/net/netdevsim/fib.c
@@ -182,9 +182,6 @@ static int nsim_fib_event_nb(struct notifier_block *nb, unsigned long event,
 	struct fib_notifier_info *info = ptr;
 	int err = 0;
 
-	if (!net_eq(info->net, &init_net))
-		return NOTIFY_DONE;
-
 	switch (event) {
 	case FIB_EVENT_RULE_ADD: /* fall through */
 	case FIB_EVENT_RULE_DEL:
@@ -258,7 +255,8 @@ struct nsim_fib_data *nsim_fib_create(struct devlink *devlink)
 	data->ipv6.rules.max = (u64)-1;
 
 	data->fib_nb.notifier_call = nsim_fib_event_nb;
-	err = register_fib_notifier(&data->fib_nb, nsim_fib_dump_inconsistent);
+	err = register_fib_notifier(&init_net, &data->fib_nb,
+				    nsim_fib_dump_inconsistent);
 	if (err) {
 		pr_err("Failed to register fib notifier\n");
 		goto err_out;
@@ -297,6 +295,6 @@ void nsim_fib_destroy(struct devlink *devlink, struct nsim_fib_data *data)
 					    NSIM_RESOURCE_IPV4_FIB_RULES);
 	devlink_resource_occ_get_unregister(devlink,
 					    NSIM_RESOURCE_IPV4_FIB);
-	unregister_fib_notifier(&data->fib_nb);
+	unregister_fib_notifier(&init_net, &data->fib_nb);
 	kfree(data);
 }
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 34de06b426ef..0931631bbc13 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -47,7 +47,6 @@ struct vif_entry_notifier_info {
 };
 
 static inline int mr_call_vif_notifier(struct notifier_block *nb,
-				       struct net *net,
 				       unsigned short family,
 				       enum fib_event_type event_type,
 				       struct vif_device *vif,
@@ -56,7 +55,6 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb,
 	struct vif_entry_notifier_info info = {
 		.info = {
 			.family = family,
-			.net = net,
 		},
 		.dev = vif->dev,
 		.vif_index = vif_index,
@@ -64,7 +62,7 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb,
 		.tb_id = tb_id,
 	};
 
-	return call_fib_notifier(nb, net, event_type, &info.info);
+	return call_fib_notifier(nb, event_type, &info.info);
 }
 
 static inline int mr_call_vif_notifiers(struct net *net,
@@ -77,7 +75,6 @@ static inline int mr_call_vif_notifiers(struct net *net,
 	struct vif_entry_notifier_info info = {
 		.info = {
 			.family = family,
-			.net = net,
 		},
 		.dev = vif->dev,
 		.vif_index = vif_index,
@@ -173,7 +170,6 @@ struct mfc_entry_notifier_info {
 };
 
 static inline int mr_call_mfc_notifier(struct notifier_block *nb,
-				       struct net *net,
 				       unsigned short family,
 				       enum fib_event_type event_type,
 				       struct mr_mfc *mfc, u32 tb_id)
@@ -181,13 +177,12 @@ static inline int mr_call_mfc_notifier(struct notifier_block *nb,
 	struct mfc_entry_notifier_info info = {
 		.info = {
 			.family = family,
-			.net = net,
 		},
 		.mfc = mfc,
 		.tb_id = tb_id
 	};
 
-	return call_fib_notifier(nb, net, event_type, &info.info);
+	return call_fib_notifier(nb, event_type, &info.info);
 }
 
 static inline int mr_call_mfc_notifiers(struct net *net,
@@ -199,7 +194,6 @@ static inline int mr_call_mfc_notifiers(struct net *net,
 	struct mfc_entry_notifier_info info = {
 		.info = {
 			.family = family,
-			.net = net,
 		},
 		.mfc = mfc,
 		.tb_id = tb_id
diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h
index c49d7bfb5c30..23353f67b2b0 100644
--- a/include/net/fib_notifier.h
+++ b/include/net/fib_notifier.h
@@ -8,7 +8,6 @@
 struct module;
 
 struct fib_notifier_info {
-	struct net *net;
 	int family;
 	struct netlink_ext_ack  *extack;
 };
@@ -35,14 +34,14 @@ struct fib_notifier_ops {
 	struct rcu_head rcu;
 };
 
-int call_fib_notifier(struct notifier_block *nb, struct net *net,
+int call_fib_notifier(struct notifier_block *nb,
 		      enum fib_event_type event_type,
 		      struct fib_notifier_info *info);
 int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
 		       struct fib_notifier_info *info);
-int register_fib_notifier(struct notifier_block *nb,
+int register_fib_notifier(struct net *net, struct notifier_block *nb,
 			  void (*cb)(struct notifier_block *nb));
-int unregister_fib_notifier(struct notifier_block *nb);
+int unregister_fib_notifier(struct net *net, struct notifier_block *nb);
 struct fib_notifier_ops *
 fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net);
 void fib_notifier_ops_unregister(struct fib_notifier_ops *ops);
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 4b5656c71abc..14e9fca0e326 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -478,7 +478,7 @@ struct ipv6_route_iter {
 
 extern const struct seq_operations ipv6_route_seq_ops;
 
-int call_fib6_notifier(struct notifier_block *nb, struct net *net,
+int call_fib6_notifier(struct notifier_block *nb,
 		       enum fib_event_type event_type,
 		       struct fib_notifier_info *info);
 int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index ab1ca9e238d2..a9df85304f40 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -219,7 +219,7 @@ struct fib_nh_notifier_info {
 	struct fib_nh *fib_nh;
 };
 
-int call_fib4_notifier(struct notifier_block *nb, struct net *net,
+int call_fib4_notifier(struct notifier_block *nb,
 		       enum fib_event_type event_type,
 		       struct fib_notifier_info *info);
 int call_fib4_notifiers(struct net *net, enum fib_event_type event_type,
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
index 470a606d5e8d..fbd029425638 100644
--- a/net/core/fib_notifier.c
+++ b/net/core/fib_notifier.c
@@ -12,17 +12,15 @@ static unsigned int fib_notifier_net_id;
 
 struct fib_notifier_net {
 	struct list_head fib_notifier_ops;
+	struct atomic_notifier_head fib_chain;
 };
 
-static ATOMIC_NOTIFIER_HEAD(fib_chain);
-
-int call_fib_notifier(struct notifier_block *nb, struct net *net,
+int call_fib_notifier(struct notifier_block *nb,
 		      enum fib_event_type event_type,
 		      struct fib_notifier_info *info)
 {
 	int err;
 
-	info->net = net;
 	err = nb->notifier_call(nb, event_type, info);
 	return notifier_to_errno(err);
 }
@@ -31,35 +29,29 @@ EXPORT_SYMBOL(call_fib_notifier);
 int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
 		       struct fib_notifier_info *info)
 {
+	struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
 	int err;
 
-	info->net = net;
-	err = atomic_notifier_call_chain(&fib_chain, event_type, info);
+	err = atomic_notifier_call_chain(&fn_net->fib_chain, event_type, info);
 	return notifier_to_errno(err);
 }
 EXPORT_SYMBOL(call_fib_notifiers);
 
-static unsigned int fib_seq_sum(void)
+static unsigned int fib_seq_sum(struct net *net)
 {
-	struct fib_notifier_net *fn_net;
+	struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
 	struct fib_notifier_ops *ops;
 	unsigned int fib_seq = 0;
-	struct net *net;
 
 	rtnl_lock();
-	down_read(&net_rwsem);
-	for_each_net(net) {
-		fn_net = net_generic(net, fib_notifier_net_id);
-		rcu_read_lock();
-		list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) {
-			if (!try_module_get(ops->owner))
-				continue;
-			fib_seq += ops->fib_seq_read(net);
-			module_put(ops->owner);
-		}
-		rcu_read_unlock();
+	rcu_read_lock();
+	list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) {
+		if (!try_module_get(ops->owner))
+			continue;
+		fib_seq += ops->fib_seq_read(net);
+		module_put(ops->owner);
 	}
-	up_read(&net_rwsem);
+	rcu_read_unlock();
 	rtnl_unlock();
 
 	return fib_seq;
@@ -69,68 +61,66 @@ static int fib_net_dump(struct net *net, struct notifier_block *nb)
 {
 	struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
 	struct fib_notifier_ops *ops;
+	int err = 0;
 
+	rcu_read_lock();
 	list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) {
-		int err;
-
 		if (!try_module_get(ops->owner))
 			continue;
 		err = ops->fib_dump(net, nb);
 		module_put(ops->owner);
 		if (err)
-			return err;
+			goto unlock;
 	}
 
-	return 0;
+unlock:
+	rcu_read_unlock();
+
+	return err;
 }
 
-static bool fib_dump_is_consistent(struct notifier_block *nb,
+static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb,
 				   void (*cb)(struct notifier_block *nb),
 				   unsigned int fib_seq)
 {
-	atomic_notifier_chain_register(&fib_chain, nb);
-	if (fib_seq == fib_seq_sum())
+	struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+	atomic_notifier_chain_register(&fn_net->fib_chain, nb);
+	if (fib_seq == fib_seq_sum(net))
 		return true;
-	atomic_notifier_chain_unregister(&fib_chain, nb);
+	atomic_notifier_chain_unregister(&fn_net->fib_chain, nb);
 	if (cb)
 		cb(nb);
 	return false;
 }
 
 #define FIB_DUMP_MAX_RETRIES 5
-int register_fib_notifier(struct notifier_block *nb,
+int register_fib_notifier(struct net *net, struct notifier_block *nb,
 			  void (*cb)(struct notifier_block *nb))
 {
 	int retries = 0;
 	int err;
 
 	do {
-		unsigned int fib_seq = fib_seq_sum();
-		struct net *net;
-
-		rcu_read_lock();
-		for_each_net_rcu(net) {
-			err = fib_net_dump(net, nb);
-			if (err)
-				goto err_fib_net_dump;
-		}
-		rcu_read_unlock();
-
-		if (fib_dump_is_consistent(nb, cb, fib_seq))
+		unsigned int fib_seq = fib_seq_sum(net);
+
+		err = fib_net_dump(net, nb);
+		if (err)
+			return err;
+
+		if (fib_dump_is_consistent(net, nb, cb, fib_seq))
 			return 0;
 	} while (++retries < FIB_DUMP_MAX_RETRIES);
 
 	return -EBUSY;
-
-err_fib_net_dump:
-	rcu_read_unlock();
-	return err;
 }
 EXPORT_SYMBOL(register_fib_notifier);
 
-int unregister_fib_notifier(struct notifier_block *nb)
+int unregister_fib_notifier(struct net *net, struct notifier_block *nb)
 {
-	return atomic_notifier_chain_unregister(&fib_chain, nb);
+	struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+	return atomic_notifier_chain_unregister(&fn_net->fib_chain, nb);
 }
 EXPORT_SYMBOL(unregister_fib_notifier);
 
@@ -181,6 +171,7 @@ static int __net_init fib_notifier_net_init(struct net *net)
 	struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
 
 	INIT_LIST_HEAD(&fn_net->fib_notifier_ops);
+	ATOMIC_INIT_NOTIFIER_HEAD(&fn_net->fib_chain);
 	return 0;
 }
 
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index dd220ce7ca7a..28cbf07102bc 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -321,7 +321,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(fib_rules_lookup);
 
-static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net,
+static int call_fib_rule_notifier(struct notifier_block *nb,
 				  enum fib_event_type event_type,
 				  struct fib_rule *rule, int family)
 {
@@ -330,7 +330,7 @@ static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net,
 		.rule = rule,
 	};
 
-	return call_fib_notifier(nb, net, event_type, &info.info);
+	return call_fib_notifier(nb, event_type, &info.info);
 }
 
 static int call_fib_rule_notifiers(struct net *net,
@@ -359,8 +359,7 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family)
 	if (!ops)
 		return -EAFNOSUPPORT;
 	list_for_each_entry_rcu(rule, &ops->rules_list, list)
-		call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule,
-				       family);
+		call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, rule, family);
 	rules_ops_put(ops);
 
 	return 0;
diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c
index b804ccbdb241..1a128c1346fb 100644
--- a/net/ipv4/fib_notifier.c
+++ b/net/ipv4/fib_notifier.c
@@ -9,12 +9,12 @@
 #include <net/netns/ipv4.h>
 #include <net/ip_fib.h>
 
-int call_fib4_notifier(struct notifier_block *nb, struct net *net,
+int call_fib4_notifier(struct notifier_block *nb,
 		       enum fib_event_type event_type,
 		       struct fib_notifier_info *info)
 {
 	info->family = AF_INET;
-	return call_fib_notifier(nb, net, event_type, info);
+	return call_fib_notifier(nb, event_type, info);
 }
 
 int call_fib4_notifiers(struct net *net, enum fib_event_type event_type,
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 1ab2fb6bb37d..5b600b2a2aa3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -74,7 +74,7 @@
 #include <trace/events/fib.h>
 #include "fib_lookup.h"
 
-static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+static int call_fib_entry_notifier(struct notifier_block *nb,
 				   enum fib_event_type event_type, u32 dst,
 				   int dst_len, struct fib_alias *fa)
 {
@@ -86,7 +86,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
 		.type = fa->fa_type,
 		.tb_id = fa->tb_id,
 	};
-	return call_fib4_notifier(nb, net, event_type, &info.info);
+	return call_fib4_notifier(nb, event_type, &info.info);
 }
 
 static int call_fib_entry_notifiers(struct net *net,
@@ -2015,8 +2015,8 @@ void fib_info_notify_update(struct net *net, struct nl_info *info)
 	}
 }
 
-static void fib_leaf_notify(struct net *net, struct key_vector *l,
-			    struct fib_table *tb, struct notifier_block *nb)
+static void fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
+			    struct notifier_block *nb)
 {
 	struct fib_alias *fa;
 
@@ -2032,20 +2032,19 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l,
 		if (tb->tb_id != fa->tb_id)
 			continue;
 
-		call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key,
+		call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key,
 					KEYLENGTH - fa->fa_slen, fa);
 	}
 }
 
-static void fib_table_notify(struct net *net, struct fib_table *tb,
-			     struct notifier_block *nb)
+static void fib_table_notify(struct fib_table *tb, struct notifier_block *nb)
 {
 	struct trie *t = (struct trie *)tb->tb_data;
 	struct key_vector *l, *tp = t->kv;
 	t_key key = 0;
 
 	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
-		fib_leaf_notify(net, l, tb, nb);
+		fib_leaf_notify(l, tb, nb);
 
 		key = l->key + 1;
 		/* stop in case of wrap around */
@@ -2063,7 +2062,7 @@ void fib_notify(struct net *net, struct notifier_block *nb)
 		struct fib_table *tb;
 
 		hlist_for_each_entry_rcu(tb, head, tb_hlist)
-			fib_table_notify(net, tb, nb);
+			fib_table_notify(tb, nb);
 	}
 }
 
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index ea48bd15a575..4dcc3214e3cc 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -409,7 +409,7 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 			if (!v->dev)
 				continue;
 
-			mr_call_vif_notifier(nb, net, family,
+			mr_call_vif_notifier(nb, family,
 					     FIB_EVENT_VIF_ADD,
 					     v, vifi, mrt->id);
 		}
@@ -417,7 +417,7 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 
 		/* Notify on table MFC entries */
 		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
-			mr_call_mfc_notifier(nb, net, family,
+			mr_call_mfc_notifier(nb, family,
 					     FIB_EVENT_ENTRY_ADD,
 					     mfc, mrt->id);
 	}
diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c
index 05f82baaa99e..4fe79296999a 100644
--- a/net/ipv6/fib6_notifier.c
+++ b/net/ipv6/fib6_notifier.c
@@ -7,12 +7,12 @@
 #include <net/netns/ipv6.h>
 #include <net/ip6_fib.h>
 
-int call_fib6_notifier(struct notifier_block *nb, struct net *net,
+int call_fib6_notifier(struct notifier_block *nb,
 		       enum fib_event_type event_type,
 		       struct fib_notifier_info *info)
 {
 	info->family = AF_INET6;
-	return call_fib_notifier(nb, net, event_type, info);
+	return call_fib_notifier(nb, event_type, info);
 }
 
 int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6e2af411cd9c..f6fae48b2e18 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -357,7 +357,7 @@ unsigned int fib6_tables_seq_read(struct net *net)
 	return fib_seq;
 }
 
-static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
+static int call_fib6_entry_notifier(struct notifier_block *nb,
 				    enum fib_event_type event_type,
 				    struct fib6_info *rt)
 {
@@ -365,7 +365,7 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
 		.rt = rt,
 	};
 
-	return call_fib6_notifier(nb, net, event_type, &info.info);
+	return call_fib6_notifier(nb, event_type, &info.info);
 }
 
 int call_fib6_entry_notifiers(struct net *net,
@@ -407,7 +407,7 @@ static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
 {
 	if (rt == arg->net->ipv6.fib6_null_entry)
 		return;
-	call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
+	call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, rt);
 }
 
 static int fib6_node_dump(struct fib6_walker *w)
-- 
cgit v1.2.3-59-g8ed1b


From 55c894f762a1a99fca80ee55d593083d78e7e4fb Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 3 Oct 2019 11:49:28 +0200
Subject: net: fib_notifier: propagate possible error during fib notifier
 registration

Unlike events for registered notifier, during the registration, the
errors that happened for the block being registered are not propagated
up to the caller. Make sure the error is propagated for FIB rules and
entries.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    |  2 +-
 net/core/fib_rules.c    | 11 ++++++++---
 net/ipv4/fib_notifier.c |  4 +---
 net/ipv4/fib_trie.c     | 31 ++++++++++++++++++++++---------
 net/ipv4/ipmr_base.c    | 22 +++++++++++++++-------
 net/ipv6/ip6_fib.c      | 36 ++++++++++++++++++++++++------------
 6 files changed, 71 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index a9df85304f40..05c1fd9c5e23 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -229,7 +229,7 @@ int __net_init fib4_notifier_init(struct net *net);
 void __net_exit fib4_notifier_exit(struct net *net);
 
 void fib_info_notify_update(struct net *net, struct nl_info *info);
-void fib_notify(struct net *net, struct notifier_block *nb);
+int fib_notify(struct net *net, struct notifier_block *nb);
 
 struct fib_table {
 	struct hlist_node	tb_hlist;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 28cbf07102bc..592d8aef90e3 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -354,15 +354,20 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family)
 {
 	struct fib_rules_ops *ops;
 	struct fib_rule *rule;
+	int err = 0;
 
 	ops = lookup_rules_ops(net, family);
 	if (!ops)
 		return -EAFNOSUPPORT;
-	list_for_each_entry_rcu(rule, &ops->rules_list, list)
-		call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, rule, family);
+	list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+		err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD,
+					     rule, family);
+		if (err)
+			break;
+	}
 	rules_ops_put(ops);
 
-	return 0;
+	return err;
 }
 EXPORT_SYMBOL_GPL(fib_rules_dump);
 
diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c
index 1a128c1346fb..0c57f68a9340 100644
--- a/net/ipv4/fib_notifier.c
+++ b/net/ipv4/fib_notifier.c
@@ -42,9 +42,7 @@ static int fib4_dump(struct net *net, struct notifier_block *nb)
 	if (err)
 		return err;
 
-	fib_notify(net, nb);
-
-	return 0;
+	return fib_notify(net, nb);
 }
 
 static const struct fib_notifier_ops fib4_notifier_ops_template = {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5b600b2a2aa3..568e59423773 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2015,10 +2015,11 @@ void fib_info_notify_update(struct net *net, struct nl_info *info)
 	}
 }
 
-static void fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
-			    struct notifier_block *nb)
+static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
+			   struct notifier_block *nb)
 {
 	struct fib_alias *fa;
+	int err;
 
 	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
 		struct fib_info *fi = fa->fa_info;
@@ -2032,38 +2033,50 @@ static void fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
 		if (tb->tb_id != fa->tb_id)
 			continue;
 
-		call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key,
-					KEYLENGTH - fa->fa_slen, fa);
+		err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key,
+					      KEYLENGTH - fa->fa_slen, fa);
+		if (err)
+			return err;
 	}
+	return 0;
 }
 
-static void fib_table_notify(struct fib_table *tb, struct notifier_block *nb)
+static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb)
 {
 	struct trie *t = (struct trie *)tb->tb_data;
 	struct key_vector *l, *tp = t->kv;
 	t_key key = 0;
+	int err;
 
 	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
-		fib_leaf_notify(l, tb, nb);
+		err = fib_leaf_notify(l, tb, nb);
+		if (err)
+			return err;
 
 		key = l->key + 1;
 		/* stop in case of wrap around */
 		if (key < l->key)
 			break;
 	}
+	return 0;
 }
 
-void fib_notify(struct net *net, struct notifier_block *nb)
+int fib_notify(struct net *net, struct notifier_block *nb)
 {
 	unsigned int h;
+	int err;
 
 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
 		struct fib_table *tb;
 
-		hlist_for_each_entry_rcu(tb, head, tb_hlist)
-			fib_table_notify(tb, nb);
+		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+			err = fib_table_notify(tb, nb);
+			if (err)
+				return err;
+		}
 	}
+	return 0;
 }
 
 static void __trie_free_rcu(struct rcu_head *head)
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 4dcc3214e3cc..c4e23c2a0d5c 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -409,17 +409,25 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 			if (!v->dev)
 				continue;
 
-			mr_call_vif_notifier(nb, family,
-					     FIB_EVENT_VIF_ADD,
-					     v, vifi, mrt->id);
+			err = mr_call_vif_notifier(nb, family,
+						   FIB_EVENT_VIF_ADD,
+						   v, vifi, mrt->id);
+			if (err)
+				break;
 		}
 		read_unlock(mrt_lock);
 
+		if (err)
+			return err;
+
 		/* Notify on table MFC entries */
-		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
-			mr_call_mfc_notifier(nb, family,
-					     FIB_EVENT_ENTRY_ADD,
-					     mfc, mrt->id);
+		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+			err = mr_call_mfc_notifier(nb, family,
+						   FIB_EVENT_ENTRY_ADD,
+						   mfc, mrt->id);
+			if (err)
+				return err;
+		}
 	}
 
 	return 0;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f6fae48b2e18..76124a909395 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -403,30 +403,37 @@ struct fib6_dump_arg {
 	struct notifier_block *nb;
 };
 
-static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
+static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
 {
 	if (rt == arg->net->ipv6.fib6_null_entry)
-		return;
-	call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, rt);
+		return 0;
+	return call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, rt);
 }
 
 static int fib6_node_dump(struct fib6_walker *w)
 {
 	struct fib6_info *rt;
+	int err = 0;
 
-	for_each_fib6_walker_rt(w)
-		fib6_rt_dump(rt, w->args);
+	for_each_fib6_walker_rt(w) {
+		err = fib6_rt_dump(rt, w->args);
+		if (err)
+			break;
+	}
 	w->leaf = NULL;
-	return 0;
+	return err;
 }
 
-static void fib6_table_dump(struct net *net, struct fib6_table *tb,
-			    struct fib6_walker *w)
+static int fib6_table_dump(struct net *net, struct fib6_table *tb,
+			   struct fib6_walker *w)
 {
+	int err;
+
 	w->root = &tb->tb6_root;
 	spin_lock_bh(&tb->tb6_lock);
-	fib6_walk(net, w);
+	err = fib6_walk(net, w);
 	spin_unlock_bh(&tb->tb6_lock);
+	return err;
 }
 
 /* Called with rcu_read_lock() */
@@ -435,6 +442,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb)
 	struct fib6_dump_arg arg;
 	struct fib6_walker *w;
 	unsigned int h;
+	int err = 0;
 
 	w = kzalloc(sizeof(*w), GFP_ATOMIC);
 	if (!w)
@@ -449,13 +457,17 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb)
 		struct hlist_head *head = &net->ipv6.fib_table_hash[h];
 		struct fib6_table *tb;
 
-		hlist_for_each_entry_rcu(tb, head, tb6_hlist)
-			fib6_table_dump(net, tb, w);
+		hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
+			err = fib6_table_dump(net, tb, w);
+			if (err < 0)
+				goto out;
+		}
 	}
 
+out:
 	kfree(w);
 
-	return 0;
+	return err;
 }
 
 static int fib6_dump_node(struct fib6_walker *w)
-- 
cgit v1.2.3-59-g8ed1b


From b7a595577ef3dc9add2b3e6d00869d017306bfbe Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 3 Oct 2019 11:49:30 +0200
Subject: net: fib_notifier: propagate extack down to the notifier block
 callback

Since errors are propagated all the way up to the caller, propagate
possible extack of the caller all the way down to the notifier block
callback.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c     |  2 +-
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c    |  2 +-
 drivers/net/ethernet/rocker/rocker_main.c            |  2 +-
 drivers/net/netdevsim/fib.c                          |  2 +-
 include/linux/mroute_base.h                          | 18 ++++++++++++------
 include/net/fib_notifier.h                           |  6 ++++--
 include/net/fib_rules.h                              |  3 ++-
 include/net/ip6_fib.h                                |  9 ++++++---
 include/net/ip_fib.h                                 |  9 ++++++---
 net/core/fib_notifier.c                              | 10 ++++++----
 net/core/fib_rules.c                                 |  9 ++++++---
 net/ipv4/fib_notifier.c                              |  7 ++++---
 net/ipv4/fib_rules.c                                 |  5 +++--
 net/ipv4/fib_trie.c                                  | 20 +++++++++++++-------
 net/ipv4/ipmr.c                                      | 13 ++++++++-----
 net/ipv4/ipmr_base.c                                 | 12 +++++++-----
 net/ipv6/fib6_notifier.c                             |  7 ++++---
 net/ipv6/fib6_rules.c                                |  5 +++--
 net/ipv6/ip6_fib.c                                   | 12 +++++++++---
 net/ipv6/ip6mr.c                                     | 13 ++++++++-----
 20 files changed, 105 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
index fe0cc969cf94..13e2944b1274 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
@@ -309,7 +309,7 @@ int mlx5_lag_mp_init(struct mlx5_lag *ldev)
 
 	mp->fib_nb.notifier_call = mlx5_lag_fib_event;
 	err = register_fib_notifier(&init_net, &mp->fib_nb,
-				    mlx5_lag_fib_event_flush);
+				    mlx5_lag_fib_event_flush, NULL);
 	if (err)
 		mp->fib_nb.notifier_call = NULL;
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 1eeff1d23b13..445e2daa54ac 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -8135,7 +8135,7 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
 
 	mlxsw_sp->router->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
 	err = register_fib_notifier(&init_net, &mlxsw_sp->router->fib_nb,
-				    mlxsw_sp_router_fib_dump_flush);
+				    mlxsw_sp_router_fib_dump_flush, NULL);
 	if (err)
 		goto err_register_fib_notifier;
 
diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index e54f6341a785..bc4f951315da 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -2991,7 +2991,7 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	 * the device, so no need to pass a callback.
 	 */
 	rocker->fib_nb.notifier_call = rocker_router_fib_event;
-	err = register_fib_notifier(&init_net, &rocker->fib_nb, NULL);
+	err = register_fib_notifier(&init_net, &rocker->fib_nb, NULL, NULL);
 	if (err)
 		goto err_register_fib_notifier;
 
diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c
index 01ee9cc54605..d2aeac0f4c2c 100644
--- a/drivers/net/netdevsim/fib.c
+++ b/drivers/net/netdevsim/fib.c
@@ -256,7 +256,7 @@ struct nsim_fib_data *nsim_fib_create(struct devlink *devlink)
 
 	data->fib_nb.notifier_call = nsim_fib_event_nb;
 	err = register_fib_notifier(&init_net, &data->fib_nb,
-				    nsim_fib_dump_inconsistent);
+				    nsim_fib_dump_inconsistent, NULL);
 	if (err) {
 		pr_err("Failed to register fib notifier\n");
 		goto err_out;
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 0931631bbc13..8071148f29a6 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -50,11 +50,13 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb,
 				       unsigned short family,
 				       enum fib_event_type event_type,
 				       struct vif_device *vif,
-				       unsigned short vif_index, u32 tb_id)
+				       unsigned short vif_index, u32 tb_id,
+				       struct netlink_ext_ack *extack)
 {
 	struct vif_entry_notifier_info info = {
 		.info = {
 			.family = family,
+			.extack = extack,
 		},
 		.dev = vif->dev,
 		.vif_index = vif_index,
@@ -172,11 +174,13 @@ struct mfc_entry_notifier_info {
 static inline int mr_call_mfc_notifier(struct notifier_block *nb,
 				       unsigned short family,
 				       enum fib_event_type event_type,
-				       struct mr_mfc *mfc, u32 tb_id)
+				       struct mr_mfc *mfc, u32 tb_id,
+				       struct netlink_ext_ack *extack)
 {
 	struct mfc_entry_notifier_info info = {
 		.info = {
 			.family = family,
+			.extack = extack,
 		},
 		.mfc = mfc,
 		.tb_id = tb_id
@@ -295,10 +299,11 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 
 int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 	    int (*rules_dump)(struct net *net,
-			      struct notifier_block *nb),
+			      struct notifier_block *nb,
+			      struct netlink_ext_ack *extack),
 	    struct mr_table *(*mr_iter)(struct net *net,
 					struct mr_table *mrt),
-	    rwlock_t *mrt_lock);
+	    rwlock_t *mrt_lock, struct netlink_ext_ack *extack);
 #else
 static inline void vif_device_init(struct vif_device *v,
 				   struct net_device *dev,
@@ -349,10 +354,11 @@ mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 static inline int mr_dump(struct net *net, struct notifier_block *nb,
 			  unsigned short family,
 			  int (*rules_dump)(struct net *net,
-					    struct notifier_block *nb),
+					    struct notifier_block *nb,
+					    struct netlink_ext_ack *extack),
 			  struct mr_table *(*mr_iter)(struct net *net,
 						      struct mr_table *mrt),
-			  rwlock_t *mrt_lock)
+			  rwlock_t *mrt_lock, struct netlink_ext_ack *extack)
 {
 	return -EINVAL;
 }
diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h
index 23353f67b2b0..6d59221ff05a 100644
--- a/include/net/fib_notifier.h
+++ b/include/net/fib_notifier.h
@@ -29,7 +29,8 @@ struct fib_notifier_ops {
 	int family;
 	struct list_head list;
 	unsigned int (*fib_seq_read)(struct net *net);
-	int (*fib_dump)(struct net *net, struct notifier_block *nb);
+	int (*fib_dump)(struct net *net, struct notifier_block *nb,
+			struct netlink_ext_ack *extack);
 	struct module *owner;
 	struct rcu_head rcu;
 };
@@ -40,7 +41,8 @@ int call_fib_notifier(struct notifier_block *nb,
 int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
 		       struct fib_notifier_info *info);
 int register_fib_notifier(struct net *net, struct notifier_block *nb,
-			  void (*cb)(struct notifier_block *nb));
+			  void (*cb)(struct notifier_block *nb),
+			  struct netlink_ext_ack *extack);
 int unregister_fib_notifier(struct net *net, struct notifier_block *nb);
 struct fib_notifier_ops *
 fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net);
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 20dcadd8eed9..54e227e6b06a 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -194,7 +194,8 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags,
 int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table,
 			 u32 flags);
 bool fib_rule_matchall(const struct fib_rule *rule);
-int fib_rules_dump(struct net *net, struct notifier_block *nb, int family);
+int fib_rules_dump(struct net *net, struct notifier_block *nb, int family,
+		   struct netlink_ext_ack *extack);
 unsigned int fib_rules_seq_read(struct net *net, int family);
 
 int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 14e9fca0e326..5d1615463138 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -488,7 +488,8 @@ int __net_init fib6_notifier_init(struct net *net);
 void __net_exit fib6_notifier_exit(struct net *net);
 
 unsigned int fib6_tables_seq_read(struct net *net);
-int fib6_tables_dump(struct net *net, struct notifier_block *nb);
+int fib6_tables_dump(struct net *net, struct notifier_block *nb,
+		     struct netlink_ext_ack *extack);
 
 void fib6_update_sernum(struct net *net, struct fib6_info *rt);
 void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt);
@@ -504,7 +505,8 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric)
 int fib6_rules_init(void);
 void fib6_rules_cleanup(void);
 bool fib6_rule_default(const struct fib_rule *rule);
-int fib6_rules_dump(struct net *net, struct notifier_block *nb);
+int fib6_rules_dump(struct net *net, struct notifier_block *nb,
+		    struct netlink_ext_ack *extack);
 unsigned int fib6_rules_seq_read(struct net *net);
 
 static inline bool fib6_rules_early_flow_dissect(struct net *net,
@@ -537,7 +539,8 @@ static inline bool fib6_rule_default(const struct fib_rule *rule)
 {
 	return true;
 }
-static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb)
+static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb,
+				  struct netlink_ext_ack *extack)
 {
 	return 0;
 }
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 05c1fd9c5e23..52b2406a5dfc 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -229,7 +229,8 @@ int __net_init fib4_notifier_init(struct net *net);
 void __net_exit fib4_notifier_exit(struct net *net);
 
 void fib_info_notify_update(struct net *net, struct nl_info *info);
-int fib_notify(struct net *net, struct notifier_block *nb);
+int fib_notify(struct net *net, struct notifier_block *nb,
+	       struct netlink_ext_ack *extack);
 
 struct fib_table {
 	struct hlist_node	tb_hlist;
@@ -315,7 +316,8 @@ static inline bool fib4_rule_default(const struct fib_rule *rule)
 	return true;
 }
 
-static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb)
+static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb,
+				  struct netlink_ext_ack *extack)
 {
 	return 0;
 }
@@ -377,7 +379,8 @@ out:
 }
 
 bool fib4_rule_default(const struct fib_rule *rule);
-int fib4_rules_dump(struct net *net, struct notifier_block *nb);
+int fib4_rules_dump(struct net *net, struct notifier_block *nb,
+		    struct netlink_ext_ack *extack);
 unsigned int fib4_rules_seq_read(struct net *net);
 
 static inline bool fib4_rules_early_flow_dissect(struct net *net,
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
index fbd029425638..fc96259807b6 100644
--- a/net/core/fib_notifier.c
+++ b/net/core/fib_notifier.c
@@ -57,7 +57,8 @@ static unsigned int fib_seq_sum(struct net *net)
 	return fib_seq;
 }
 
-static int fib_net_dump(struct net *net, struct notifier_block *nb)
+static int fib_net_dump(struct net *net, struct notifier_block *nb,
+			struct netlink_ext_ack *extack)
 {
 	struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
 	struct fib_notifier_ops *ops;
@@ -67,7 +68,7 @@ static int fib_net_dump(struct net *net, struct notifier_block *nb)
 	list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) {
 		if (!try_module_get(ops->owner))
 			continue;
-		err = ops->fib_dump(net, nb);
+		err = ops->fib_dump(net, nb, extack);
 		module_put(ops->owner);
 		if (err)
 			goto unlock;
@@ -96,7 +97,8 @@ static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb,
 
 #define FIB_DUMP_MAX_RETRIES 5
 int register_fib_notifier(struct net *net, struct notifier_block *nb,
-			  void (*cb)(struct notifier_block *nb))
+			  void (*cb)(struct notifier_block *nb),
+			  struct netlink_ext_ack *extack)
 {
 	int retries = 0;
 	int err;
@@ -104,7 +106,7 @@ int register_fib_notifier(struct net *net, struct notifier_block *nb,
 	do {
 		unsigned int fib_seq = fib_seq_sum(net);
 
-		err = fib_net_dump(net, nb);
+		err = fib_net_dump(net, nb, extack);
 		if (err)
 			return err;
 
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 592d8aef90e3..3e7e15278c46 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -323,10 +323,12 @@ EXPORT_SYMBOL_GPL(fib_rules_lookup);
 
 static int call_fib_rule_notifier(struct notifier_block *nb,
 				  enum fib_event_type event_type,
-				  struct fib_rule *rule, int family)
+				  struct fib_rule *rule, int family,
+				  struct netlink_ext_ack *extack)
 {
 	struct fib_rule_notifier_info info = {
 		.info.family = family,
+		.info.extack = extack,
 		.rule = rule,
 	};
 
@@ -350,7 +352,8 @@ static int call_fib_rule_notifiers(struct net *net,
 }
 
 /* Called with rcu_read_lock() */
-int fib_rules_dump(struct net *net, struct notifier_block *nb, int family)
+int fib_rules_dump(struct net *net, struct notifier_block *nb, int family,
+		   struct netlink_ext_ack *extack)
 {
 	struct fib_rules_ops *ops;
 	struct fib_rule *rule;
@@ -361,7 +364,7 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family)
 		return -EAFNOSUPPORT;
 	list_for_each_entry_rcu(rule, &ops->rules_list, list) {
 		err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD,
-					     rule, family);
+					     rule, family, extack);
 		if (err)
 			break;
 	}
diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c
index 0c57f68a9340..0c28bd469a68 100644
--- a/net/ipv4/fib_notifier.c
+++ b/net/ipv4/fib_notifier.c
@@ -34,15 +34,16 @@ static unsigned int fib4_seq_read(struct net *net)
 	return net->ipv4.fib_seq + fib4_rules_seq_read(net);
 }
 
-static int fib4_dump(struct net *net, struct notifier_block *nb)
+static int fib4_dump(struct net *net, struct notifier_block *nb,
+		     struct netlink_ext_ack *extack)
 {
 	int err;
 
-	err = fib4_rules_dump(net, nb);
+	err = fib4_rules_dump(net, nb, extack);
 	if (err)
 		return err;
 
-	return fib_notify(net, nb);
+	return fib_notify(net, nb, extack);
 }
 
 static const struct fib_notifier_ops fib4_notifier_ops_template = {
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index b43a7ba5c6a4..f99e3bac5cab 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -65,9 +65,10 @@ bool fib4_rule_default(const struct fib_rule *rule)
 }
 EXPORT_SYMBOL_GPL(fib4_rule_default);
 
-int fib4_rules_dump(struct net *net, struct notifier_block *nb)
+int fib4_rules_dump(struct net *net, struct notifier_block *nb,
+		    struct netlink_ext_ack *extack)
 {
-	return fib_rules_dump(net, nb, AF_INET);
+	return fib_rules_dump(net, nb, AF_INET, extack);
 }
 
 unsigned int fib4_rules_seq_read(struct net *net)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 568e59423773..b9df9c09b84e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -76,9 +76,11 @@
 
 static int call_fib_entry_notifier(struct notifier_block *nb,
 				   enum fib_event_type event_type, u32 dst,
-				   int dst_len, struct fib_alias *fa)
+				   int dst_len, struct fib_alias *fa,
+				   struct netlink_ext_ack *extack)
 {
 	struct fib_entry_notifier_info info = {
+		.info.extack = extack,
 		.dst = dst,
 		.dst_len = dst_len,
 		.fi = fa->fa_info,
@@ -2016,7 +2018,8 @@ void fib_info_notify_update(struct net *net, struct nl_info *info)
 }
 
 static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
-			   struct notifier_block *nb)
+			   struct notifier_block *nb,
+			   struct netlink_ext_ack *extack)
 {
 	struct fib_alias *fa;
 	int err;
@@ -2034,14 +2037,16 @@ static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
 			continue;
 
 		err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key,
-					      KEYLENGTH - fa->fa_slen, fa);
+					      KEYLENGTH - fa->fa_slen,
+					      fa, extack);
 		if (err)
 			return err;
 	}
 	return 0;
 }
 
-static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb)
+static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb,
+			    struct netlink_ext_ack *extack)
 {
 	struct trie *t = (struct trie *)tb->tb_data;
 	struct key_vector *l, *tp = t->kv;
@@ -2049,7 +2054,7 @@ static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb)
 	int err;
 
 	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
-		err = fib_leaf_notify(l, tb, nb);
+		err = fib_leaf_notify(l, tb, nb, extack);
 		if (err)
 			return err;
 
@@ -2061,7 +2066,8 @@ static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb)
 	return 0;
 }
 
-int fib_notify(struct net *net, struct notifier_block *nb)
+int fib_notify(struct net *net, struct notifier_block *nb,
+	       struct netlink_ext_ack *extack)
 {
 	unsigned int h;
 	int err;
@@ -2071,7 +2077,7 @@ int fib_notify(struct net *net, struct notifier_block *nb)
 		struct fib_table *tb;
 
 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
-			err = fib_table_notify(tb, nb);
+			err = fib_table_notify(tb, nb, extack);
 			if (err)
 				return err;
 		}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 313470f6bb14..051f365b64d2 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -278,9 +278,10 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 	rtnl_unlock();
 }
 
-static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
+			   struct netlink_ext_ack *extack)
 {
-	return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR);
+	return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack);
 }
 
 static unsigned int ipmr_rules_seq_read(struct net *net)
@@ -336,7 +337,8 @@ static void __net_exit ipmr_rules_exit(struct net *net)
 	rtnl_unlock();
 }
 
-static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
+			   struct netlink_ext_ack *extack)
 {
 	return 0;
 }
@@ -3040,10 +3042,11 @@ static unsigned int ipmr_seq_read(struct net *net)
 	return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net);
 }
 
-static int ipmr_dump(struct net *net, struct notifier_block *nb)
+static int ipmr_dump(struct net *net, struct notifier_block *nb,
+		     struct netlink_ext_ack *extack)
 {
 	return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump,
-		       ipmr_mr_table_iter, &mrt_lock);
+		       ipmr_mr_table_iter, &mrt_lock, extack);
 }
 
 static const struct fib_notifier_ops ipmr_notifier_ops_template = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index c4e23c2a0d5c..aa8738a91210 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -386,15 +386,17 @@ EXPORT_SYMBOL(mr_rtm_dumproute);
 
 int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 	    int (*rules_dump)(struct net *net,
-			      struct notifier_block *nb),
+			      struct notifier_block *nb,
+			      struct netlink_ext_ack *extack),
 	    struct mr_table *(*mr_iter)(struct net *net,
 					struct mr_table *mrt),
-	    rwlock_t *mrt_lock)
+	    rwlock_t *mrt_lock,
+	    struct netlink_ext_ack *extack)
 {
 	struct mr_table *mrt;
 	int err;
 
-	err = rules_dump(net, nb);
+	err = rules_dump(net, nb, extack);
 	if (err)
 		return err;
 
@@ -411,7 +413,7 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 
 			err = mr_call_vif_notifier(nb, family,
 						   FIB_EVENT_VIF_ADD,
-						   v, vifi, mrt->id);
+						   v, vifi, mrt->id, extack);
 			if (err)
 				break;
 		}
@@ -424,7 +426,7 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
 			err = mr_call_mfc_notifier(nb, family,
 						   FIB_EVENT_ENTRY_ADD,
-						   mfc, mrt->id);
+						   mfc, mrt->id, extack);
 			if (err)
 				return err;
 		}
diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c
index 4fe79296999a..f87ae33e1d01 100644
--- a/net/ipv6/fib6_notifier.c
+++ b/net/ipv6/fib6_notifier.c
@@ -27,15 +27,16 @@ static unsigned int fib6_seq_read(struct net *net)
 	return fib6_tables_seq_read(net) + fib6_rules_seq_read(net);
 }
 
-static int fib6_dump(struct net *net, struct notifier_block *nb)
+static int fib6_dump(struct net *net, struct notifier_block *nb,
+		     struct netlink_ext_ack *extack)
 {
 	int err;
 
-	err = fib6_rules_dump(net, nb);
+	err = fib6_rules_dump(net, nb, extack);
 	if (err)
 		return err;
 
-	return fib6_tables_dump(net, nb);
+	return fib6_tables_dump(net, nb, extack);
 }
 
 static const struct fib_notifier_ops fib6_notifier_ops_template = {
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index f9e8fe3ff0c5..fafe556d21e0 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -47,9 +47,10 @@ bool fib6_rule_default(const struct fib_rule *rule)
 }
 EXPORT_SYMBOL_GPL(fib6_rule_default);
 
-int fib6_rules_dump(struct net *net, struct notifier_block *nb)
+int fib6_rules_dump(struct net *net, struct notifier_block *nb,
+		    struct netlink_ext_ack *extack)
 {
-	return fib_rules_dump(net, nb, AF_INET6);
+	return fib_rules_dump(net, nb, AF_INET6, extack);
 }
 
 unsigned int fib6_rules_seq_read(struct net *net)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 76124a909395..f66bc2af4e9d 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -359,9 +359,11 @@ unsigned int fib6_tables_seq_read(struct net *net)
 
 static int call_fib6_entry_notifier(struct notifier_block *nb,
 				    enum fib_event_type event_type,
-				    struct fib6_info *rt)
+				    struct fib6_info *rt,
+				    struct netlink_ext_ack *extack)
 {
 	struct fib6_entry_notifier_info info = {
+		.info.extack = extack,
 		.rt = rt,
 	};
 
@@ -401,13 +403,15 @@ int call_fib6_multipath_entry_notifiers(struct net *net,
 struct fib6_dump_arg {
 	struct net *net;
 	struct notifier_block *nb;
+	struct netlink_ext_ack *extack;
 };
 
 static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
 {
 	if (rt == arg->net->ipv6.fib6_null_entry)
 		return 0;
-	return call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, rt);
+	return call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD,
+					rt, arg->extack);
 }
 
 static int fib6_node_dump(struct fib6_walker *w)
@@ -437,7 +441,8 @@ static int fib6_table_dump(struct net *net, struct fib6_table *tb,
 }
 
 /* Called with rcu_read_lock() */
-int fib6_tables_dump(struct net *net, struct notifier_block *nb)
+int fib6_tables_dump(struct net *net, struct notifier_block *nb,
+		     struct netlink_ext_ack *extack)
 {
 	struct fib6_dump_arg arg;
 	struct fib6_walker *w;
@@ -451,6 +456,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb)
 	w->func = fib6_node_dump;
 	arg.net = net;
 	arg.nb = nb;
+	arg.extack = extack;
 	w->args = &arg;
 
 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 857a89ad4d6c..bfa49ff70531 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -265,9 +265,10 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
 	rtnl_unlock();
 }
 
-static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
+			    struct netlink_ext_ack *extack)
 {
-	return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR);
+	return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack);
 }
 
 static unsigned int ip6mr_rules_seq_read(struct net *net)
@@ -324,7 +325,8 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
 	rtnl_unlock();
 }
 
-static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
+			    struct netlink_ext_ack *extack)
 {
 	return 0;
 }
@@ -1256,10 +1258,11 @@ static unsigned int ip6mr_seq_read(struct net *net)
 	return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net);
 }
 
-static int ip6mr_dump(struct net *net, struct notifier_block *nb)
+static int ip6mr_dump(struct net *net, struct notifier_block *nb,
+		      struct netlink_ext_ack *extack)
 {
 	return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump,
-		       ip6mr_mr_table_iter, &mrt_lock);
+		       ip6mr_mr_table_iter, &mrt_lock, extack);
 }
 
 static struct notifier_block ip6_mr_notifier = {
-- 
cgit v1.2.3-59-g8ed1b


From 471f894f106573b0b086d1003ee6172253c67b59 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 3 Oct 2019 11:49:31 +0200
Subject: net: devlink: export devlink net getter

Allow drivers to get net struct for devlink instance.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 1 +
 net/core/devlink.c    | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 23e4b65ec9df..5ac2be0f0857 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -771,6 +771,7 @@ static inline struct devlink *netdev_to_devlink(struct net_device *dev)
 
 struct ib_device;
 
+struct net *devlink_net(const struct devlink *devlink);
 struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size);
 int devlink_register(struct devlink *devlink, struct device *dev);
 void devlink_unregister(struct devlink *devlink);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index e48680efe54a..362cbbcca225 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -95,10 +95,11 @@ static LIST_HEAD(devlink_list);
  */
 static DEFINE_MUTEX(devlink_mutex);
 
-static struct net *devlink_net(const struct devlink *devlink)
+struct net *devlink_net(const struct devlink *devlink)
 {
 	return read_pnet(&devlink->_net);
 }
+EXPORT_SYMBOL_GPL(devlink_net);
 
 static void devlink_net_set(struct devlink *devlink, struct net *net)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 070c63f20f6c739a3c534555f56c7327536bfcc2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 3 Oct 2019 11:49:39 +0200
Subject: net: devlink: allow to change namespaces during reload

All devlink instances are created in init_net and stay there for a
lifetime. Allow user to be able to move devlink instances into
namespaces during devlink reload operation. That ensures proper
re-instantiation of driver objects, including netdevices.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/main.c  |   6 +-
 drivers/net/ethernet/mellanox/mlxsw/core.c |   1 +
 drivers/net/netdevsim/dev.c                |   2 +-
 include/net/devlink.h                      |   2 +-
 include/uapi/linux/devlink.h               |   4 +
 net/core/devlink.c                         | 154 +++++++++++++++++++++++++++--
 6 files changed, 158 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index fce9b3a24347..22c72fb7206a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -3935,13 +3935,17 @@ static void mlx4_restart_one_down(struct pci_dev *pdev);
 static int mlx4_restart_one_up(struct pci_dev *pdev, bool reload,
 			       struct devlink *devlink);
 
-static int mlx4_devlink_reload_down(struct devlink *devlink,
+static int mlx4_devlink_reload_down(struct devlink *devlink, bool netns_change,
 				    struct netlink_ext_ack *extack)
 {
 	struct mlx4_priv *priv = devlink_priv(devlink);
 	struct mlx4_dev *dev = &priv->dev;
 	struct mlx4_dev_persistent *persist = dev->persist;
 
+	if (netns_change) {
+		NL_SET_ERR_MSG_MOD(extack, "Namespace change is not supported");
+		return -EOPNOTSUPP;
+	}
 	if (persist->num_vfs)
 		mlx4_warn(persist->dev, "Reload performed on PF, will cause reset on operating Virtual Functions\n");
 	mlx4_restart_one_down(persist->pdev);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 1e61a012ca43..1c29522a2af3 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -985,6 +985,7 @@ mlxsw_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req,
 
 static int
 mlxsw_devlink_core_bus_device_reload_down(struct devlink *devlink,
+					  bool netns_change,
 					  struct netlink_ext_ack *extack)
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index 7de80faab047..3f3c7cc21077 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -473,7 +473,7 @@ static int nsim_dev_reload_create(struct nsim_dev *nsim_dev,
 				  struct netlink_ext_ack *extack);
 static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev);
 
-static int nsim_dev_reload_down(struct devlink *devlink,
+static int nsim_dev_reload_down(struct devlink *devlink, bool netns_change,
 				struct netlink_ext_ack *extack)
 {
 	struct nsim_dev *nsim_dev = devlink_priv(devlink);
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 5ac2be0f0857..3c9d4a063c98 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -643,7 +643,7 @@ enum devlink_trap_group_generic_id {
 	}
 
 struct devlink_ops {
-	int (*reload_down)(struct devlink *devlink,
+	int (*reload_down)(struct devlink *devlink, bool netns_change,
 			   struct netlink_ext_ack *extack);
 	int (*reload_up)(struct devlink *devlink,
 			 struct netlink_ext_ack *extack);
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 580b7a2e40e1..b558ea88b766 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -421,6 +421,10 @@ enum devlink_attr {
 
 	DEVLINK_ATTR_RELOAD_FAILED,			/* u8 0 or 1 */
 
+	DEVLINK_ATTR_NETNS_FD,			/* u32 */
+	DEVLINK_ATTR_NETNS_PID,			/* u32 */
+	DEVLINK_ATTR_NETNS_ID,			/* u32 */
+
 	/* add new attributes above here, update the policy in devlink.c */
 
 	__DEVLINK_ATTR_MAX,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 362cbbcca225..c4d8c4ab0fb5 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -435,8 +435,16 @@ static void devlink_nl_post_doit(const struct genl_ops *ops,
 {
 	struct devlink *devlink;
 
-	devlink = devlink_get_from_info(info);
-	if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+	/* When devlink changes netns, it would not be found
+	 * by devlink_get_from_info(). So try if it is stored first.
+	 */
+	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) {
+		devlink = info->user_ptr[0];
+	} else {
+		devlink = devlink_get_from_info(info);
+		WARN_ON(IS_ERR(devlink));
+	}
+	if (!IS_ERR(devlink) && ~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
 		mutex_unlock(&devlink->lock);
 	mutex_unlock(&devlink_mutex);
 }
@@ -2675,6 +2683,72 @@ devlink_resources_validate(struct devlink *devlink,
 	return err;
 }
 
+static struct net *devlink_netns_get(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID];
+	struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD];
+	struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID];
+	struct net *net;
+
+	if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) {
+		NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (netns_pid_attr) {
+		net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr));
+	} else if (netns_fd_attr) {
+		net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr));
+	} else if (netns_id_attr) {
+		net = get_net_ns_by_id(sock_net(skb->sk),
+				       nla_get_u32(netns_id_attr));
+		if (!net)
+			net = ERR_PTR(-EINVAL);
+	} else {
+		WARN_ON(1);
+		net = ERR_PTR(-EINVAL);
+	}
+	if (IS_ERR(net)) {
+		NL_SET_ERR_MSG(info->extack, "Unknown network namespace");
+		return ERR_PTR(-EINVAL);
+	}
+	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+		put_net(net);
+		return ERR_PTR(-EPERM);
+	}
+	return net;
+}
+
+static void devlink_param_notify(struct devlink *devlink,
+				 unsigned int port_index,
+				 struct devlink_param_item *param_item,
+				 enum devlink_command cmd);
+
+static void devlink_reload_netns_change(struct devlink *devlink,
+					struct net *dest_net)
+{
+	struct devlink_param_item *param_item;
+
+	/* Userspace needs to be notified about devlink objects
+	 * removed from original and entering new network namespace.
+	 * The rest of the devlink objects are re-created during
+	 * reload process so the notifications are generated separatelly.
+	 */
+
+	list_for_each_entry(param_item, &devlink->param_list, list)
+		devlink_param_notify(devlink, 0, param_item,
+				     DEVLINK_CMD_PARAM_DEL);
+	devlink_notify(devlink, DEVLINK_CMD_DEL);
+
+	devlink_net_set(devlink, dest_net);
+
+	devlink_notify(devlink, DEVLINK_CMD_NEW);
+	list_for_each_entry(param_item, &devlink->param_list, list)
+		devlink_param_notify(devlink, 0, param_item,
+				     DEVLINK_CMD_PARAM_NEW);
+}
+
 static bool devlink_reload_supported(struct devlink *devlink)
 {
 	return devlink->ops->reload_down && devlink->ops->reload_up;
@@ -2695,9 +2769,27 @@ bool devlink_is_reload_failed(const struct devlink *devlink)
 }
 EXPORT_SYMBOL_GPL(devlink_is_reload_failed);
 
+static int devlink_reload(struct devlink *devlink, struct net *dest_net,
+			  struct netlink_ext_ack *extack)
+{
+	int err;
+
+	err = devlink->ops->reload_down(devlink, !!dest_net, extack);
+	if (err)
+		return err;
+
+	if (dest_net && !net_eq(dest_net, devlink_net(devlink)))
+		devlink_reload_netns_change(devlink, dest_net);
+
+	err = devlink->ops->reload_up(devlink, extack);
+	devlink_reload_failed_set(devlink, !!err);
+	return err;
+}
+
 static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
 {
 	struct devlink *devlink = info->user_ptr[0];
+	struct net *dest_net = NULL;
 	int err;
 
 	if (!devlink_reload_supported(devlink))
@@ -2708,11 +2800,20 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
 		NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
 		return err;
 	}
-	err = devlink->ops->reload_down(devlink, info->extack);
-	if (err)
-		return err;
-	err = devlink->ops->reload_up(devlink, info->extack);
-	devlink_reload_failed_set(devlink, !!err);
+
+	if (info->attrs[DEVLINK_ATTR_NETNS_PID] ||
+	    info->attrs[DEVLINK_ATTR_NETNS_FD] ||
+	    info->attrs[DEVLINK_ATTR_NETNS_ID]) {
+		dest_net = devlink_netns_get(skb, info);
+		if (IS_ERR(dest_net))
+			return PTR_ERR(dest_net);
+	}
+
+	err = devlink_reload(devlink, dest_net, info->extack);
+
+	if (dest_net)
+		put_net(dest_net);
+
 	return err;
 }
 
@@ -5794,6 +5895,9 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 },
+	[DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 },
+	[DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 },
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -8061,9 +8165,43 @@ int devlink_compat_switch_id_get(struct net_device *dev,
 	return 0;
 }
 
+static void __net_exit devlink_pernet_pre_exit(struct net *net)
+{
+	struct devlink *devlink;
+	int err;
+
+	/* In case network namespace is getting destroyed, reload
+	 * all devlink instances from this namespace into init_net.
+	 */
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (net_eq(devlink_net(devlink), net)) {
+			if (WARN_ON(!devlink_reload_supported(devlink)))
+				continue;
+			err = devlink_reload(devlink, &init_net, NULL);
+			if (err)
+				pr_warn("Failed to reload devlink instance into init_net\n");
+		}
+	}
+	mutex_unlock(&devlink_mutex);
+}
+
+static struct pernet_operations devlink_pernet_ops __net_initdata = {
+	.pre_exit = devlink_pernet_pre_exit,
+};
+
 static int __init devlink_init(void)
 {
-	return genl_register_family(&devlink_nl_family);
+	int err;
+
+	err = genl_register_family(&devlink_nl_family);
+	if (err)
+		goto out;
+	err = register_pernet_subsys(&devlink_pernet_ops);
+
+out:
+	WARN_ON(err);
+	return err;
 }
 
 subsys_initcall(devlink_init);
-- 
cgit v1.2.3-59-g8ed1b


From 25a3cd8189c8832c04225e6f1d41228fd6cc64cc Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 Oct 2019 11:18:54 -0700
Subject: net/tls: move TOE-related structures to a separate header

Move tls_device structure and register/unregister functions
to a new header to avoid confusion with normal, non-TOE offload.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/chelsio/chtls/chtls.h |  1 +
 include/net/tls.h                    | 34 -----------------
 include/net/tls_toe.h                | 73 ++++++++++++++++++++++++++++++++++++
 net/tls/tls_main.c                   |  1 +
 4 files changed, 75 insertions(+), 34 deletions(-)
 create mode 100644 include/net/tls_toe.h

(limited to 'include')

diff --git a/drivers/crypto/chelsio/chtls/chtls.h b/drivers/crypto/chelsio/chtls/chtls.h
index 025c831d0899..e353c42fea91 100644
--- a/drivers/crypto/chelsio/chtls/chtls.h
+++ b/drivers/crypto/chelsio/chtls/chtls.h
@@ -21,6 +21,7 @@
 #include <crypto/internal/hash.h>
 #include <linux/tls.h>
 #include <net/tls.h>
+#include <net/tls_toe.h>
 
 #include "t4fw_api.h"
 #include "t4_msg.h"
diff --git a/include/net/tls.h b/include/net/tls.h
index c664e6dba0d1..57865c944095 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -60,7 +60,6 @@
 #define TLS_RECORD_TYPE_DATA		0x17
 
 #define TLS_AAD_SPACE_SIZE		13
-#define TLS_DEVICE_NAME_MAX		32
 
 #define MAX_IV_SIZE			16
 #define TLS_MAX_REC_SEQ_SIZE		8
@@ -74,37 +73,6 @@
  */
 #define TLS_AES_CCM_IV_B0_BYTE		2
 
-/*
- * This structure defines the routines for Inline TLS driver.
- * The following routines are optional and filled with a
- * null pointer if not defined.
- *
- * @name: Its the name of registered Inline tls device
- * @dev_list: Inline tls device list
- * int (*feature)(struct tls_device *device);
- *     Called to return Inline TLS driver capability
- *
- * int (*hash)(struct tls_device *device, struct sock *sk);
- *     This function sets Inline driver for listen and program
- *     device specific functioanlity as required
- *
- * void (*unhash)(struct tls_device *device, struct sock *sk);
- *     This function cleans listen state set by Inline TLS driver
- *
- * void (*release)(struct kref *kref);
- *     Release the registered device and allocated resources
- * @kref: Number of reference to tls_device
- */
-struct tls_device {
-	char name[TLS_DEVICE_NAME_MAX];
-	struct list_head dev_list;
-	int  (*feature)(struct tls_device *device);
-	int  (*hash)(struct tls_device *device, struct sock *sk);
-	void (*unhash)(struct tls_device *device, struct sock *sk);
-	void (*release)(struct kref *kref);
-	struct kref kref;
-};
-
 enum {
 	TLS_BASE,
 	TLS_SW,
@@ -643,8 +611,6 @@ static inline bool tls_offload_tx_resync_pending(struct sock *sk)
 
 int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
 		      unsigned char *record_type);
-void tls_register_device(struct tls_device *device);
-void tls_unregister_device(struct tls_device *device);
 int decrypt_skb(struct sock *sk, struct sk_buff *skb,
 		struct scatterlist *sgout);
 struct sk_buff *tls_encrypt_skb(struct sk_buff *skb);
diff --git a/include/net/tls_toe.h b/include/net/tls_toe.h
new file mode 100644
index 000000000000..81b66c76b31f
--- /dev/null
+++ b/include/net/tls_toe.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kref.h>
+#include <linux/list.h>
+
+struct sock;
+
+#define TLS_DEVICE_NAME_MAX		32
+
+/*
+ * This structure defines the routines for Inline TLS driver.
+ * The following routines are optional and filled with a
+ * null pointer if not defined.
+ *
+ * @name: Its the name of registered Inline tls device
+ * @dev_list: Inline tls device list
+ * int (*feature)(struct tls_device *device);
+ *     Called to return Inline TLS driver capability
+ *
+ * int (*hash)(struct tls_device *device, struct sock *sk);
+ *     This function sets Inline driver for listen and program
+ *     device specific functioanlity as required
+ *
+ * void (*unhash)(struct tls_device *device, struct sock *sk);
+ *     This function cleans listen state set by Inline TLS driver
+ *
+ * void (*release)(struct kref *kref);
+ *     Release the registered device and allocated resources
+ * @kref: Number of reference to tls_device
+ */
+struct tls_device {
+	char name[TLS_DEVICE_NAME_MAX];
+	struct list_head dev_list;
+	int  (*feature)(struct tls_device *device);
+	int  (*hash)(struct tls_device *device, struct sock *sk);
+	void (*unhash)(struct tls_device *device, struct sock *sk);
+	void (*release)(struct kref *kref);
+	struct kref kref;
+};
+
+void tls_register_device(struct tls_device *device);
+void tls_unregister_device(struct tls_device *device);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index ac88877dcade..a19c6a1e034a 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -42,6 +42,7 @@
 #include <linux/inet_diag.h>
 
 #include <net/tls.h>
+#include <net/tls_toe.h>
 
 MODULE_AUTHOR("Mellanox Technologies");
 MODULE_DESCRIPTION("Transport Layer Security Support");
-- 
cgit v1.2.3-59-g8ed1b


From f21912edd1570818cbcb16bd1da7d7a2b122d66b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 Oct 2019 11:18:55 -0700
Subject: net/tls: rename tls_device to tls_toe_device

Rename struct tls_device to struct tls_toe_device to avoid
confusion with normal, non-TOE offload.

No functional changes.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/chelsio/chtls/chtls.h      |  4 ++--
 drivers/crypto/chelsio/chtls/chtls_main.c | 20 ++++++++++----------
 include/net/tls_toe.h                     | 24 ++++++++++++------------
 net/tls/tls_main.c                        | 14 +++++++-------
 4 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/crypto/chelsio/chtls/chtls.h b/drivers/crypto/chelsio/chtls/chtls.h
index e353c42fea91..d2bc655ab931 100644
--- a/drivers/crypto/chelsio/chtls/chtls.h
+++ b/drivers/crypto/chelsio/chtls/chtls.h
@@ -119,7 +119,7 @@ struct tls_scmd {
 };
 
 struct chtls_dev {
-	struct tls_device tlsdev;
+	struct tls_toe_device tlsdev;
 	struct list_head list;
 	struct cxgb4_lld_info *lldi;
 	struct pci_dev *pdev;
@@ -363,7 +363,7 @@ enum {
 #define TCP_PAGE(sk)   (sk->sk_frag.page)
 #define TCP_OFF(sk)    (sk->sk_frag.offset)
 
-static inline struct chtls_dev *to_chtls_dev(struct tls_device *tlsdev)
+static inline struct chtls_dev *to_chtls_dev(struct tls_toe_device *tlsdev)
 {
 	return container_of(tlsdev, struct chtls_dev, tlsdev);
 }
diff --git a/drivers/crypto/chelsio/chtls/chtls_main.c b/drivers/crypto/chelsio/chtls/chtls_main.c
index e6df5b95ed47..18996935d8ba 100644
--- a/drivers/crypto/chelsio/chtls/chtls_main.c
+++ b/drivers/crypto/chelsio/chtls/chtls_main.c
@@ -124,7 +124,7 @@ static void chtls_stop_listen(struct chtls_dev *cdev, struct sock *sk)
 	mutex_unlock(&notify_mutex);
 }
 
-static int chtls_inline_feature(struct tls_device *dev)
+static int chtls_inline_feature(struct tls_toe_device *dev)
 {
 	struct net_device *netdev;
 	struct chtls_dev *cdev;
@@ -140,7 +140,7 @@ static int chtls_inline_feature(struct tls_device *dev)
 	return 0;
 }
 
-static int chtls_create_hash(struct tls_device *dev, struct sock *sk)
+static int chtls_create_hash(struct tls_toe_device *dev, struct sock *sk)
 {
 	struct chtls_dev *cdev = to_chtls_dev(dev);
 
@@ -149,7 +149,7 @@ static int chtls_create_hash(struct tls_device *dev, struct sock *sk)
 	return 0;
 }
 
-static void chtls_destroy_hash(struct tls_device *dev, struct sock *sk)
+static void chtls_destroy_hash(struct tls_toe_device *dev, struct sock *sk)
 {
 	struct chtls_dev *cdev = to_chtls_dev(dev);
 
@@ -161,7 +161,7 @@ static void chtls_free_uld(struct chtls_dev *cdev)
 {
 	int i;
 
-	tls_unregister_device(&cdev->tlsdev);
+	tls_toe_unregister_device(&cdev->tlsdev);
 	kvfree(cdev->kmap.addr);
 	idr_destroy(&cdev->hwtid_idr);
 	for (i = 0; i < (1 << RSPQ_HASH_BITS); i++)
@@ -173,27 +173,27 @@ static void chtls_free_uld(struct chtls_dev *cdev)
 
 static inline void chtls_dev_release(struct kref *kref)
 {
+	struct tls_toe_device *dev;
 	struct chtls_dev *cdev;
-	struct tls_device *dev;
 
-	dev = container_of(kref, struct tls_device, kref);
+	dev = container_of(kref, struct tls_toe_device, kref);
 	cdev = to_chtls_dev(dev);
 	chtls_free_uld(cdev);
 }
 
 static void chtls_register_dev(struct chtls_dev *cdev)
 {
-	struct tls_device *tlsdev = &cdev->tlsdev;
+	struct tls_toe_device *tlsdev = &cdev->tlsdev;
 
-	strlcpy(tlsdev->name, "chtls", TLS_DEVICE_NAME_MAX);
+	strlcpy(tlsdev->name, "chtls", TLS_TOE_DEVICE_NAME_MAX);
 	strlcat(tlsdev->name, cdev->lldi->ports[0]->name,
-		TLS_DEVICE_NAME_MAX);
+		TLS_TOE_DEVICE_NAME_MAX);
 	tlsdev->feature = chtls_inline_feature;
 	tlsdev->hash = chtls_create_hash;
 	tlsdev->unhash = chtls_destroy_hash;
 	tlsdev->release = chtls_dev_release;
 	kref_init(&tlsdev->kref);
-	tls_register_device(tlsdev);
+	tls_toe_register_device(tlsdev);
 	cdev->cdev_state = CHTLS_CDEV_STATE_UP;
 }
 
diff --git a/include/net/tls_toe.h b/include/net/tls_toe.h
index 81b66c76b31f..b56d30a5bd6d 100644
--- a/include/net/tls_toe.h
+++ b/include/net/tls_toe.h
@@ -36,7 +36,7 @@
 
 struct sock;
 
-#define TLS_DEVICE_NAME_MAX		32
+#define TLS_TOE_DEVICE_NAME_MAX		32
 
 /*
  * This structure defines the routines for Inline TLS driver.
@@ -45,29 +45,29 @@ struct sock;
  *
  * @name: Its the name of registered Inline tls device
  * @dev_list: Inline tls device list
- * int (*feature)(struct tls_device *device);
+ * int (*feature)(struct tls_toe_device *device);
  *     Called to return Inline TLS driver capability
  *
- * int (*hash)(struct tls_device *device, struct sock *sk);
+ * int (*hash)(struct tls_toe_device *device, struct sock *sk);
  *     This function sets Inline driver for listen and program
  *     device specific functioanlity as required
  *
- * void (*unhash)(struct tls_device *device, struct sock *sk);
+ * void (*unhash)(struct tls_toe_device *device, struct sock *sk);
  *     This function cleans listen state set by Inline TLS driver
  *
  * void (*release)(struct kref *kref);
  *     Release the registered device and allocated resources
- * @kref: Number of reference to tls_device
+ * @kref: Number of reference to tls_toe_device
  */
-struct tls_device {
-	char name[TLS_DEVICE_NAME_MAX];
+struct tls_toe_device {
+	char name[TLS_TOE_DEVICE_NAME_MAX];
 	struct list_head dev_list;
-	int  (*feature)(struct tls_device *device);
-	int  (*hash)(struct tls_device *device, struct sock *sk);
-	void (*unhash)(struct tls_device *device, struct sock *sk);
+	int  (*feature)(struct tls_toe_device *device);
+	int  (*hash)(struct tls_toe_device *device, struct sock *sk);
+	void (*unhash)(struct tls_toe_device *device, struct sock *sk);
 	void (*release)(struct kref *kref);
 	struct kref kref;
 };
 
-void tls_register_device(struct tls_device *device);
-void tls_unregister_device(struct tls_device *device);
+void tls_toe_register_device(struct tls_toe_device *device);
+void tls_toe_unregister_device(struct tls_toe_device *device);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index a19c6a1e034a..a1203807a3ef 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -657,8 +657,8 @@ static void tls_hw_sk_destruct(struct sock *sk)
 
 static int tls_hw_prot(struct sock *sk)
 {
+	struct tls_toe_device *dev;
 	struct tls_context *ctx;
-	struct tls_device *dev;
 	int rc = 0;
 
 	spin_lock_bh(&device_spinlock);
@@ -688,7 +688,7 @@ out:
 static void tls_hw_unhash(struct sock *sk)
 {
 	struct tls_context *ctx = tls_get_ctx(sk);
-	struct tls_device *dev;
+	struct tls_toe_device *dev;
 
 	spin_lock_bh(&device_spinlock);
 	list_for_each_entry(dev, &device_list, dev_list) {
@@ -707,7 +707,7 @@ static void tls_hw_unhash(struct sock *sk)
 static int tls_hw_hash(struct sock *sk)
 {
 	struct tls_context *ctx = tls_get_ctx(sk);
-	struct tls_device *dev;
+	struct tls_toe_device *dev;
 	int err;
 
 	err = ctx->sk_proto->hash(sk);
@@ -878,21 +878,21 @@ static size_t tls_get_info_size(const struct sock *sk)
 	return size;
 }
 
-void tls_register_device(struct tls_device *device)
+void tls_toe_register_device(struct tls_toe_device *device)
 {
 	spin_lock_bh(&device_spinlock);
 	list_add_tail(&device->dev_list, &device_list);
 	spin_unlock_bh(&device_spinlock);
 }
-EXPORT_SYMBOL(tls_register_device);
+EXPORT_SYMBOL(tls_toe_register_device);
 
-void tls_unregister_device(struct tls_device *device)
+void tls_toe_unregister_device(struct tls_toe_device *device)
 {
 	spin_lock_bh(&device_spinlock);
 	list_del(&device->dev_list);
 	spin_unlock_bh(&device_spinlock);
 }
-EXPORT_SYMBOL(tls_unregister_device);
+EXPORT_SYMBOL(tls_toe_unregister_device);
 
 static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
 	.name			= "tls",
-- 
cgit v1.2.3-59-g8ed1b


From 08700dab816847d5e600ef263155fb04ea4b312d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 Oct 2019 11:18:57 -0700
Subject: net/tls: move TOE-related code to a separate file

Move tls_hw_* functions to a new, separate source file
to avoid confusion with normal, non-TOE offload.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h     |   3 ++
 include/net/tls_toe.h |   4 ++
 net/tls/Makefile      |   2 +-
 net/tls/tls_main.c    | 105 ++------------------------------------
 net/tls/tls_toe.c     | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 150 insertions(+), 103 deletions(-)
 create mode 100644 net/tls/tls_toe.c

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 57865c944095..5c48cb9e0c18 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -308,7 +308,10 @@ struct tls_offload_context_rx {
 #define TLS_OFFLOAD_CONTEXT_SIZE_RX					\
 	(sizeof(struct tls_offload_context_rx) + TLS_DRIVER_STATE_SIZE_RX)
 
+struct tls_context *tls_ctx_create(struct sock *sk);
 void tls_ctx_free(struct sock *sk, struct tls_context *ctx);
+void update_sk_prot(struct sock *sk, struct tls_context *ctx);
+
 int wait_on_pending_writer(struct sock *sk, long *timeo);
 int tls_sk_query(struct sock *sk, int optname, char __user *optval,
 		int __user *optlen);
diff --git a/include/net/tls_toe.h b/include/net/tls_toe.h
index b56d30a5bd6d..3bb39c795aed 100644
--- a/include/net/tls_toe.h
+++ b/include/net/tls_toe.h
@@ -69,5 +69,9 @@ struct tls_toe_device {
 	struct kref kref;
 };
 
+int tls_hw_prot(struct sock *sk);
+int tls_hw_hash(struct sock *sk);
+void tls_hw_unhash(struct sock *sk);
+
 void tls_toe_register_device(struct tls_toe_device *device);
 void tls_toe_unregister_device(struct tls_toe_device *device);
diff --git a/net/tls/Makefile b/net/tls/Makefile
index ef0dc74ce8f9..322250e912db 100644
--- a/net/tls/Makefile
+++ b/net/tls/Makefile
@@ -5,6 +5,6 @@
 
 obj-$(CONFIG_TLS) += tls.o
 
-tls-y := tls_main.o tls_sw.o
+tls-y := tls_main.o tls_sw.o tls_toe.o
 
 tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 7bc2ad26316f..9d0cf14b2f7e 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -59,14 +59,12 @@ static struct proto *saved_tcpv6_prot;
 static DEFINE_MUTEX(tcpv6_prot_mutex);
 static struct proto *saved_tcpv4_prot;
 static DEFINE_MUTEX(tcpv4_prot_mutex);
-static LIST_HEAD(device_list);
-static DEFINE_SPINLOCK(device_spinlock);
 static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
 static struct proto_ops tls_sw_proto_ops;
 static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
 			 struct proto *base);
 
-static void update_sk_prot(struct sock *sk, struct tls_context *ctx)
+void update_sk_prot(struct sock *sk, struct tls_context *ctx)
 {
 	int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
 
@@ -604,7 +602,7 @@ static int tls_setsockopt(struct sock *sk, int level, int optname,
 	return do_tls_setsockopt(sk, optname, optval, optlen);
 }
 
-static struct tls_context *create_ctx(struct sock *sk)
+struct tls_context *tls_ctx_create(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tls_context *ctx;
@@ -644,87 +642,6 @@ static void tls_build_proto(struct sock *sk)
 	}
 }
 
-static void tls_hw_sk_destruct(struct sock *sk)
-{
-	struct tls_context *ctx = tls_get_ctx(sk);
-	struct inet_connection_sock *icsk = inet_csk(sk);
-
-	ctx->sk_destruct(sk);
-	/* Free ctx */
-	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
-	tls_ctx_free(sk, ctx);
-}
-
-static int tls_hw_prot(struct sock *sk)
-{
-	struct tls_toe_device *dev;
-	struct tls_context *ctx;
-	int rc = 0;
-
-	spin_lock_bh(&device_spinlock);
-	list_for_each_entry(dev, &device_list, dev_list) {
-		if (dev->feature && dev->feature(dev)) {
-			ctx = create_ctx(sk);
-			if (!ctx)
-				goto out;
-
-			ctx->sk_destruct = sk->sk_destruct;
-			sk->sk_destruct = tls_hw_sk_destruct;
-			ctx->rx_conf = TLS_HW_RECORD;
-			ctx->tx_conf = TLS_HW_RECORD;
-			update_sk_prot(sk, ctx);
-			rc = 1;
-			break;
-		}
-	}
-out:
-	spin_unlock_bh(&device_spinlock);
-	return rc;
-}
-
-static void tls_hw_unhash(struct sock *sk)
-{
-	struct tls_context *ctx = tls_get_ctx(sk);
-	struct tls_toe_device *dev;
-
-	spin_lock_bh(&device_spinlock);
-	list_for_each_entry(dev, &device_list, dev_list) {
-		if (dev->unhash) {
-			kref_get(&dev->kref);
-			spin_unlock_bh(&device_spinlock);
-			dev->unhash(dev, sk);
-			kref_put(&dev->kref, dev->release);
-			spin_lock_bh(&device_spinlock);
-		}
-	}
-	spin_unlock_bh(&device_spinlock);
-	ctx->sk_proto->unhash(sk);
-}
-
-static int tls_hw_hash(struct sock *sk)
-{
-	struct tls_context *ctx = tls_get_ctx(sk);
-	struct tls_toe_device *dev;
-	int err;
-
-	err = ctx->sk_proto->hash(sk);
-	spin_lock_bh(&device_spinlock);
-	list_for_each_entry(dev, &device_list, dev_list) {
-		if (dev->hash) {
-			kref_get(&dev->kref);
-			spin_unlock_bh(&device_spinlock);
-			err |= dev->hash(dev, sk);
-			kref_put(&dev->kref, dev->release);
-			spin_lock_bh(&device_spinlock);
-		}
-	}
-	spin_unlock_bh(&device_spinlock);
-
-	if (err)
-		tls_hw_unhash(sk);
-	return err;
-}
-
 static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
 			 struct proto *base)
 {
@@ -789,7 +706,7 @@ static int tls_init(struct sock *sk)
 
 	/* allocate tls context */
 	write_lock_bh(&sk->sk_callback_lock);
-	ctx = create_ctx(sk);
+	ctx = tls_ctx_create(sk);
 	if (!ctx) {
 		rc = -ENOMEM;
 		goto out;
@@ -875,22 +792,6 @@ static size_t tls_get_info_size(const struct sock *sk)
 	return size;
 }
 
-void tls_toe_register_device(struct tls_toe_device *device)
-{
-	spin_lock_bh(&device_spinlock);
-	list_add_tail(&device->dev_list, &device_list);
-	spin_unlock_bh(&device_spinlock);
-}
-EXPORT_SYMBOL(tls_toe_register_device);
-
-void tls_toe_unregister_device(struct tls_toe_device *device)
-{
-	spin_lock_bh(&device_spinlock);
-	list_del(&device->dev_list);
-	spin_unlock_bh(&device_spinlock);
-}
-EXPORT_SYMBOL(tls_toe_unregister_device);
-
 static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
 	.name			= "tls",
 	.owner			= THIS_MODULE,
diff --git a/net/tls/tls_toe.c b/net/tls/tls_toe.c
new file mode 100644
index 000000000000..89a7014a05f7
--- /dev/null
+++ b/net/tls/tls_toe.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <net/inet_connection_sock.h>
+#include <net/tls.h>
+#include <net/tls_toe.h>
+
+static LIST_HEAD(device_list);
+static DEFINE_SPINLOCK(device_spinlock);
+
+static void tls_hw_sk_destruct(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tls_context *ctx = tls_get_ctx(sk);
+
+	ctx->sk_destruct(sk);
+	/* Free ctx */
+	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
+	tls_ctx_free(sk, ctx);
+}
+
+int tls_hw_prot(struct sock *sk)
+{
+	struct tls_toe_device *dev;
+	struct tls_context *ctx;
+	int rc = 0;
+
+	spin_lock_bh(&device_spinlock);
+	list_for_each_entry(dev, &device_list, dev_list) {
+		if (dev->feature && dev->feature(dev)) {
+			ctx = tls_ctx_create(sk);
+			if (!ctx)
+				goto out;
+
+			ctx->sk_destruct = sk->sk_destruct;
+			sk->sk_destruct = tls_hw_sk_destruct;
+			ctx->rx_conf = TLS_HW_RECORD;
+			ctx->tx_conf = TLS_HW_RECORD;
+			update_sk_prot(sk, ctx);
+			rc = 1;
+			break;
+		}
+	}
+out:
+	spin_unlock_bh(&device_spinlock);
+	return rc;
+}
+
+void tls_hw_unhash(struct sock *sk)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+	struct tls_toe_device *dev;
+
+	spin_lock_bh(&device_spinlock);
+	list_for_each_entry(dev, &device_list, dev_list) {
+		if (dev->unhash) {
+			kref_get(&dev->kref);
+			spin_unlock_bh(&device_spinlock);
+			dev->unhash(dev, sk);
+			kref_put(&dev->kref, dev->release);
+			spin_lock_bh(&device_spinlock);
+		}
+	}
+	spin_unlock_bh(&device_spinlock);
+	ctx->sk_proto->unhash(sk);
+}
+
+int tls_hw_hash(struct sock *sk)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+	struct tls_toe_device *dev;
+	int err;
+
+	err = ctx->sk_proto->hash(sk);
+	spin_lock_bh(&device_spinlock);
+	list_for_each_entry(dev, &device_list, dev_list) {
+		if (dev->hash) {
+			kref_get(&dev->kref);
+			spin_unlock_bh(&device_spinlock);
+			err |= dev->hash(dev, sk);
+			kref_put(&dev->kref, dev->release);
+			spin_lock_bh(&device_spinlock);
+		}
+	}
+	spin_unlock_bh(&device_spinlock);
+
+	if (err)
+		tls_hw_unhash(sk);
+	return err;
+}
+
+void tls_toe_register_device(struct tls_toe_device *device)
+{
+	spin_lock_bh(&device_spinlock);
+	list_add_tail(&device->dev_list, &device_list);
+	spin_unlock_bh(&device_spinlock);
+}
+EXPORT_SYMBOL(tls_toe_register_device);
+
+void tls_toe_unregister_device(struct tls_toe_device *device)
+{
+	spin_lock_bh(&device_spinlock);
+	list_del(&device->dev_list);
+	spin_unlock_bh(&device_spinlock);
+}
+EXPORT_SYMBOL(tls_toe_unregister_device);
-- 
cgit v1.2.3-59-g8ed1b


From 0eb8745e03c9ed2a7412c7a844ebc4f0e4f80de4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 Oct 2019 11:18:58 -0700
Subject: net/tls: rename tls_hw_* functions tls_toe_*

The tls_hw_* functions are quite confusingly named, since they
are related to the TOE-offload, not TLS_HW offload which doesn't
require TOE. Rename them.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls_toe.h |  6 +++---
 net/tls/tls_main.c    |  6 +++---
 net/tls/tls_toe.c     | 12 ++++++------
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/tls_toe.h b/include/net/tls_toe.h
index 3bb39c795aed..b3aa7593ce2c 100644
--- a/include/net/tls_toe.h
+++ b/include/net/tls_toe.h
@@ -69,9 +69,9 @@ struct tls_toe_device {
 	struct kref kref;
 };
 
-int tls_hw_prot(struct sock *sk);
-int tls_hw_hash(struct sock *sk);
-void tls_hw_unhash(struct sock *sk);
+int tls_toe_bypass(struct sock *sk);
+int tls_toe_hash(struct sock *sk);
+void tls_toe_unhash(struct sock *sk);
 
 void tls_toe_register_device(struct tls_toe_device *device);
 void tls_toe_unregister_device(struct tls_toe_device *device);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 9d0cf14b2f7e..483dda6c3155 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -681,8 +681,8 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
 #endif
 
 	prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base;
-	prot[TLS_HW_RECORD][TLS_HW_RECORD].hash		= tls_hw_hash;
-	prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash	= tls_hw_unhash;
+	prot[TLS_HW_RECORD][TLS_HW_RECORD].hash		= tls_toe_hash;
+	prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash	= tls_toe_unhash;
 }
 
 static int tls_init(struct sock *sk)
@@ -692,7 +692,7 @@ static int tls_init(struct sock *sk)
 
 	tls_build_proto(sk);
 
-	if (tls_hw_prot(sk))
+	if (tls_toe_bypass(sk))
 		return 0;
 
 	/* The TLS ulp is currently supported only for TCP sockets
diff --git a/net/tls/tls_toe.c b/net/tls/tls_toe.c
index 89a7014a05f7..7e1330f19165 100644
--- a/net/tls/tls_toe.c
+++ b/net/tls/tls_toe.c
@@ -41,7 +41,7 @@
 static LIST_HEAD(device_list);
 static DEFINE_SPINLOCK(device_spinlock);
 
-static void tls_hw_sk_destruct(struct sock *sk)
+static void tls_toe_sk_destruct(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tls_context *ctx = tls_get_ctx(sk);
@@ -52,7 +52,7 @@ static void tls_hw_sk_destruct(struct sock *sk)
 	tls_ctx_free(sk, ctx);
 }
 
-int tls_hw_prot(struct sock *sk)
+int tls_toe_bypass(struct sock *sk)
 {
 	struct tls_toe_device *dev;
 	struct tls_context *ctx;
@@ -66,7 +66,7 @@ int tls_hw_prot(struct sock *sk)
 				goto out;
 
 			ctx->sk_destruct = sk->sk_destruct;
-			sk->sk_destruct = tls_hw_sk_destruct;
+			sk->sk_destruct = tls_toe_sk_destruct;
 			ctx->rx_conf = TLS_HW_RECORD;
 			ctx->tx_conf = TLS_HW_RECORD;
 			update_sk_prot(sk, ctx);
@@ -79,7 +79,7 @@ out:
 	return rc;
 }
 
-void tls_hw_unhash(struct sock *sk)
+void tls_toe_unhash(struct sock *sk)
 {
 	struct tls_context *ctx = tls_get_ctx(sk);
 	struct tls_toe_device *dev;
@@ -98,7 +98,7 @@ void tls_hw_unhash(struct sock *sk)
 	ctx->sk_proto->unhash(sk);
 }
 
-int tls_hw_hash(struct sock *sk)
+int tls_toe_hash(struct sock *sk)
 {
 	struct tls_context *ctx = tls_get_ctx(sk);
 	struct tls_toe_device *dev;
@@ -118,7 +118,7 @@ int tls_hw_hash(struct sock *sk)
 	spin_unlock_bh(&device_spinlock);
 
 	if (err)
-		tls_hw_unhash(sk);
+		tls_toe_unhash(sk);
 	return err;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From d6547f2a2cfc8b145b59291d3e4b072891f34882 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 3 Oct 2019 23:29:24 +0300
Subject: net, uapi: fix -Wpointer-arith warnings

Add casts to fix these warnings:

./usr/include/linux/netfilter_arp/arp_tables.h:200:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith]
./usr/include/linux/netfilter_bridge/ebtables.h:197:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith]
./usr/include/linux/netfilter_ipv4/ip_tables.h:223:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith]
./usr/include/linux/netfilter_ipv6/ip6_tables.h:263:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith]
./usr/include/linux/tipc_config.h:310:28: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith]
./usr/include/linux/tipc_config.h:410:24: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith]
./usr/include/linux/virtio_ring.h:170:16: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith]

Those are theoretical probably but kernel doesn't control compiler flags
in userspace.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/netfilter_arp/arp_tables.h  | 2 +-
 include/uapi/linux/netfilter_bridge/ebtables.h | 2 +-
 include/uapi/linux/netfilter_ipv4/ip_tables.h  | 2 +-
 include/uapi/linux/netfilter_ipv6/ip6_tables.h | 2 +-
 include/uapi/linux/tipc_config.h               | 4 ++--
 include/uapi/linux/virtio_ring.h               | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter_arp/arp_tables.h b/include/uapi/linux/netfilter_arp/arp_tables.h
index a2a0927d9bd6..bbf5af2b67a8 100644
--- a/include/uapi/linux/netfilter_arp/arp_tables.h
+++ b/include/uapi/linux/netfilter_arp/arp_tables.h
@@ -199,7 +199,7 @@ struct arpt_get_entries {
 /* Helper functions */
 static __inline__ struct xt_entry_target *arpt_get_target(struct arpt_entry *e)
 {
-	return (void *)e + e->target_offset;
+	return (struct xt_entry_target *)((char *)e + e->target_offset);
 }
 
 /*
diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h
index 8076c940ffeb..a494cf43a755 100644
--- a/include/uapi/linux/netfilter_bridge/ebtables.h
+++ b/include/uapi/linux/netfilter_bridge/ebtables.h
@@ -194,7 +194,7 @@ struct ebt_entry {
 static __inline__ struct ebt_entry_target *
 ebt_get_target(struct ebt_entry *e)
 {
-	return (void *)e + e->target_offset;
+	return (struct ebt_entry_target *)((char *)e + e->target_offset);
 }
 
 /* {g,s}etsockopt numbers */
diff --git a/include/uapi/linux/netfilter_ipv4/ip_tables.h b/include/uapi/linux/netfilter_ipv4/ip_tables.h
index 6aaeb14bfce1..50c7fee625ae 100644
--- a/include/uapi/linux/netfilter_ipv4/ip_tables.h
+++ b/include/uapi/linux/netfilter_ipv4/ip_tables.h
@@ -222,7 +222,7 @@ struct ipt_get_entries {
 static __inline__ struct xt_entry_target *
 ipt_get_target(struct ipt_entry *e)
 {
-	return (void *)e + e->target_offset;
+	return (struct xt_entry_target *)((char *)e + e->target_offset);
 }
 
 /*
diff --git a/include/uapi/linux/netfilter_ipv6/ip6_tables.h b/include/uapi/linux/netfilter_ipv6/ip6_tables.h
index 031d0a43bed2..d9e364f96a5c 100644
--- a/include/uapi/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/uapi/linux/netfilter_ipv6/ip6_tables.h
@@ -262,7 +262,7 @@ struct ip6t_get_entries {
 static __inline__ struct xt_entry_target *
 ip6t_get_target(struct ip6t_entry *e)
 {
-	return (void *)e + e->target_offset;
+	return (struct xt_entry_target *)((char *)e + e->target_offset);
 }
 
 /*
diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h
index 4955e1a9f1bc..4dfc05651c98 100644
--- a/include/uapi/linux/tipc_config.h
+++ b/include/uapi/linux/tipc_config.h
@@ -309,7 +309,7 @@ static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len)
 	tlv_ptr->tlv_len  = htons(tlv_len);
 	if (len && data) {
 		memcpy(TLV_DATA(tlv_ptr), data, len);
-		memset(TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len);
+		memset((char *)TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len);
 	}
 	return TLV_SPACE(len);
 }
@@ -409,7 +409,7 @@ static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags,
 	tcm_hdr->tcm_flags = htons(flags);
 	if (data_len && data) {
 		memcpy(TCM_DATA(msg), data, data_len);
-		memset(TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len);
+		memset((char *)TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len);
 	}
 	return TCM_SPACE(data_len);
 }
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 4c4e24c291a5..559f42e73315 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -169,7 +169,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p,
 {
 	vr->num = num;
 	vr->desc = p;
-	vr->avail = p + num*sizeof(struct vring_desc);
+	vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc));
 	vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16)
 		+ align-1) & ~(align - 1));
 }
-- 
cgit v1.2.3-59-g8ed1b


From 193d357d087309f2d5ab8e8caab1af5e3bc29fa0 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 3 Oct 2019 23:56:37 +0300
Subject: net: spread "enum sock_flags"

Some ints are "enum sock_flags" in fact.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 2 +-
 net/core/sock.c    | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 2c53f1a1d905..ab905c4b1f0e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2512,7 +2512,7 @@ static inline bool sk_listener(const struct sock *sk)
 	return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
 }
 
-void sock_enable_timestamp(struct sock *sk, int flag);
+void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
 		       int type);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 07863edbe6fc..9774ab2ed3f1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -687,7 +687,8 @@ out:
 	return ret;
 }
 
-static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
+static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
+				     int valbool)
 {
 	if (valbool)
 		sock_set_flag(sk, bit);
@@ -3033,7 +3034,7 @@ int sock_gettstamp(struct socket *sock, void __user *userstamp,
 }
 EXPORT_SYMBOL(sock_gettstamp);
 
-void sock_enable_timestamp(struct sock *sk, int flag)
+void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
 {
 	if (!sock_flag(sk, flag)) {
 		unsigned long previous_flags = sk->sk_flags;
-- 
cgit v1.2.3-59-g8ed1b


From 8538d29cea9530f114159e06bfa31b2358161493 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 4 Oct 2019 16:19:22 -0700
Subject: net/tls: add tracing for device/offload events

Add tracing of device-related interaction to aid performance
analysis, especially around resync:

 tls:tls_device_offload_set
 tls:tls_device_rx_resync_send
 tls:tls_device_rx_resync_nh_schedule
 tls:tls_device_rx_resync_nh_delay
 tls:tls_device_tx_resync_req
 tls:tls_device_tx_resync_send

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |   3 +-
 include/net/tls.h                                  |   8 +-
 net/tls/Makefile                                   |   4 +-
 net/tls/tls_device.c                               |  30 +++-
 net/tls/trace.c                                    |  10 ++
 net/tls/trace.h                                    | 169 +++++++++++++++++++++
 6 files changed, 213 insertions(+), 11 deletions(-)
 create mode 100644 net/tls/trace.c
 create mode 100644 net/tls/trace.h

(limited to 'include')

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 61aabffc8888..bcdcd6de7dea 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -872,7 +872,8 @@ nfp_net_tls_tx(struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec,
 
 		/* jump forward, a TX may have gotten lost, need to sync TX */
 		if (!resync_pending && seq - ntls->next_seq < U32_MAX / 4)
-			tls_offload_tx_resync_request(nskb->sk);
+			tls_offload_tx_resync_request(nskb->sk, seq,
+						      ntls->next_seq);
 
 		*nr_frags = 0;
 		return nskb;
diff --git a/include/net/tls.h b/include/net/tls.h
index 5c48cb9e0c18..38086ade65ce 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -594,13 +594,6 @@ tls_offload_rx_resync_set_type(struct sock *sk, enum tls_offload_sync_type type)
 	tls_offload_ctx_rx(tls_ctx)->resync_type = type;
 }
 
-static inline void tls_offload_tx_resync_request(struct sock *sk)
-{
-	struct tls_context *tls_ctx = tls_get_ctx(sk);
-
-	WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags));
-}
-
 /* Driver's seq tracking has to be disabled until resync succeeded */
 static inline bool tls_offload_tx_resync_pending(struct sock *sk)
 {
@@ -634,6 +627,7 @@ void tls_device_free_resources_tx(struct sock *sk);
 int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx);
 void tls_device_offload_cleanup_rx(struct sock *sk);
 void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq);
+void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq);
 int tls_device_decrypted(struct sock *sk, struct sk_buff *skb);
 #else
 static inline void tls_device_init(void) {}
diff --git a/net/tls/Makefile b/net/tls/Makefile
index 95d8c06a14b9..0606d43d7582 100644
--- a/net/tls/Makefile
+++ b/net/tls/Makefile
@@ -3,9 +3,11 @@
 # Makefile for the TLS subsystem.
 #
 
+CFLAGS_trace.o := -I$(src)
+
 obj-$(CONFIG_TLS) += tls.o
 
-tls-y := tls_main.o tls_sw.o
+tls-y := tls_main.o tls_sw.o trace.o
 
 tls-$(CONFIG_TLS_TOE) += tls_toe.o
 tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index f959487c5cd1..9f423caf48e3 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -38,6 +38,8 @@
 #include <net/tcp.h>
 #include <net/tls.h>
 
+#include "trace.h"
+
 /* device_offload_lock is used to synchronize tls_dev_add
  * against NETDEV_DOWN notifications.
  */
@@ -202,6 +204,15 @@ void tls_device_free_resources_tx(struct sock *sk)
 	tls_free_partial_record(sk, tls_ctx);
 }
 
+void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+
+	trace_tls_device_tx_resync_req(sk, got_seq, exp_seq);
+	WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags));
+}
+EXPORT_SYMBOL_GPL(tls_offload_tx_resync_request);
+
 static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx,
 				 u32 seq)
 {
@@ -216,6 +227,7 @@ static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx,
 
 	rcd_sn = tls_ctx->tx.rec_seq;
 
+	trace_tls_device_tx_resync_send(sk, seq, rcd_sn);
 	down_read(&device_offload_lock);
 	netdev = tls_ctx->netdev;
 	if (netdev)
@@ -637,10 +649,13 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
 static void tls_device_resync_rx(struct tls_context *tls_ctx,
 				 struct sock *sk, u32 seq, u8 *rcd_sn)
 {
+	struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx);
 	struct net_device *netdev;
 
 	if (WARN_ON(test_and_set_bit(TLS_RX_SYNC_RUNNING, &tls_ctx->flags)))
 		return;
+
+	trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_ctx->resync_type);
 	netdev = READ_ONCE(tls_ctx->netdev);
 	if (netdev)
 		netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn,
@@ -653,8 +668,8 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq)
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_offload_context_rx *rx_ctx;
 	u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE];
+	u32 sock_data, is_req_pending;
 	struct tls_prot_info *prot;
-	u32 is_req_pending;
 	s64 resync_req;
 	u32 req_seq;
 
@@ -683,8 +698,12 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq)
 		/* head of next rec is already in, note that the sock_inq will
 		 * include the currently parsed message when called from parser
 		 */
-		if (tcp_inq(sk) > rcd_len)
+		sock_data = tcp_inq(sk);
+		if (sock_data > rcd_len) {
+			trace_tls_device_rx_resync_nh_delay(sk, sock_data,
+							    rcd_len);
 			return;
+		}
 
 		rx_ctx->resync_nh_do_now = 0;
 		seq += rcd_len;
@@ -728,6 +747,7 @@ static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx,
 
 	/* head of next rec is already in, parser will sync for us */
 	if (tcp_inq(sk) > rxm->full_len) {
+		trace_tls_device_rx_resync_nh_schedule(sk);
 		ctx->resync_nh_do_now = 1;
 	} else {
 		struct tls_prot_info *prot = &tls_ctx->prot_info;
@@ -1013,6 +1033,8 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
 	rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX,
 					     &ctx->crypto_send.info,
 					     tcp_sk(sk)->write_seq);
+	trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_TX,
+				     tcp_sk(sk)->write_seq, rec_seq, rc);
 	if (rc)
 		goto release_lock;
 
@@ -1049,6 +1071,7 @@ free_marker_record:
 
 int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
 {
+	struct tls12_crypto_info_aes_gcm_128 *info;
 	struct tls_offload_context_rx *context;
 	struct net_device *netdev;
 	int rc = 0;
@@ -1096,6 +1119,9 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
 	rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX,
 					     &ctx->crypto_recv.info,
 					     tcp_sk(sk)->copied_seq);
+	info = (void *)&ctx->crypto_recv.info;
+	trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_RX,
+				     tcp_sk(sk)->copied_seq, info->rec_seq, rc);
 	if (rc)
 		goto free_sw_resources;
 
diff --git a/net/tls/trace.c b/net/tls/trace.c
new file mode 100644
index 000000000000..e374913cf9c9
--- /dev/null
+++ b/net/tls/trace.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#include <linux/module.h>
+
+#ifndef __CHECKER__
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#endif
diff --git a/net/tls/trace.h b/net/tls/trace.h
new file mode 100644
index 000000000000..95b6ded2f9b2
--- /dev/null
+++ b/net/tls/trace.h
@@ -0,0 +1,169 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM tls
+
+#if !defined(_TLS_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ)
+#define _TLS_TRACE_H_
+
+#include <asm/unaligned.h>
+#include <linux/tracepoint.h>
+
+struct sock;
+
+TRACE_EVENT(tls_device_offload_set,
+
+	TP_PROTO(struct sock *sk, int dir, u32 tcp_seq, u8 *rec_no, int ret),
+
+	TP_ARGS(sk, dir, tcp_seq, rec_no, ret),
+
+	TP_STRUCT__entry(
+		__field(	struct sock *,	sk		)
+		__field(	u64,		rec_no		)
+		__field(	int,		dir		)
+		__field(	u32,		tcp_seq		)
+		__field(	int,		ret		)
+	),
+
+	TP_fast_assign(
+		__entry->sk = sk;
+		__entry->rec_no = get_unaligned_be64(rec_no);
+		__entry->dir = dir;
+		__entry->tcp_seq = tcp_seq;
+		__entry->ret = ret;
+	),
+
+	TP_printk(
+		"sk=%p direction=%d tcp_seq=%u rec_no=%llu ret=%d",
+		__entry->sk, __entry->dir, __entry->tcp_seq, __entry->rec_no,
+		__entry->ret
+	)
+);
+
+TRACE_EVENT(tls_device_rx_resync_send,
+
+	TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no, int sync_type),
+
+	TP_ARGS(sk, tcp_seq, rec_no, sync_type),
+
+	TP_STRUCT__entry(
+		__field(	struct sock *,	sk		)
+		__field(	u64,		rec_no		)
+		__field(	u32,		tcp_seq		)
+		__field(	int,		sync_type	)
+	),
+
+	TP_fast_assign(
+		__entry->sk = sk;
+		__entry->rec_no = get_unaligned_be64(rec_no);
+		__entry->tcp_seq = tcp_seq;
+		__entry->sync_type = sync_type;
+	),
+
+	TP_printk(
+		"sk=%p tcp_seq=%u rec_no=%llu sync_type=%d",
+		__entry->sk, __entry->tcp_seq, __entry->rec_no,
+		__entry->sync_type
+	)
+);
+
+TRACE_EVENT(tls_device_rx_resync_nh_schedule,
+
+	TP_PROTO(struct sock *sk),
+
+	TP_ARGS(sk),
+
+	TP_STRUCT__entry(
+		__field(	struct sock *,	sk		)
+	),
+
+	TP_fast_assign(
+		__entry->sk = sk;
+	),
+
+	TP_printk(
+		"sk=%p", __entry->sk
+	)
+);
+
+TRACE_EVENT(tls_device_rx_resync_nh_delay,
+
+	TP_PROTO(struct sock *sk, u32 sock_data, u32 rec_len),
+
+	TP_ARGS(sk, sock_data, rec_len),
+
+	TP_STRUCT__entry(
+		__field(	struct sock *,	sk		)
+		__field(	u32,		sock_data	)
+		__field(	u32,		rec_len		)
+	),
+
+	TP_fast_assign(
+		__entry->sk = sk;
+		__entry->sock_data = sock_data;
+		__entry->rec_len = rec_len;
+	),
+
+	TP_printk(
+		"sk=%p sock_data=%u rec_len=%u",
+		__entry->sk, __entry->sock_data, __entry->rec_len
+	)
+);
+
+TRACE_EVENT(tls_device_tx_resync_req,
+
+	TP_PROTO(struct sock *sk, u32 tcp_seq, u32 exp_tcp_seq),
+
+	TP_ARGS(sk, tcp_seq, exp_tcp_seq),
+
+	TP_STRUCT__entry(
+		__field(	struct sock *,	sk		)
+		__field(	u32,		tcp_seq		)
+		__field(	u32,		exp_tcp_seq	)
+	),
+
+	TP_fast_assign(
+		__entry->sk = sk;
+		__entry->tcp_seq = tcp_seq;
+		__entry->exp_tcp_seq = exp_tcp_seq;
+	),
+
+	TP_printk(
+		"sk=%p tcp_seq=%u exp_tcp_seq=%u",
+		__entry->sk, __entry->tcp_seq, __entry->exp_tcp_seq
+	)
+);
+
+TRACE_EVENT(tls_device_tx_resync_send,
+
+	TP_PROTO(struct sock *sk, u32 tcp_seq, u8 *rec_no),
+
+	TP_ARGS(sk, tcp_seq, rec_no),
+
+	TP_STRUCT__entry(
+		__field(	struct sock *,	sk		)
+		__field(	u64,		rec_no		)
+		__field(	u32,		tcp_seq		)
+	),
+
+	TP_fast_assign(
+		__entry->sk = sk;
+		__entry->rec_no = get_unaligned_be64(rec_no);
+		__entry->tcp_seq = tcp_seq;
+	),
+
+	TP_printk(
+		"sk=%p tcp_seq=%u rec_no=%llu",
+		__entry->sk, __entry->tcp_seq, __entry->rec_no
+	)
+);
+
+#endif /* _TLS_TRACE_H_ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+#include <trace/define_trace.h>
-- 
cgit v1.2.3-59-g8ed1b


From d26b698dd3cd52f5a3277446a87e5e0198c99cd0 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 4 Oct 2019 16:19:24 -0700
Subject: net/tls: add skeleton of MIB statistics

Add a skeleton structure for adding TLS statistics.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/tls.rst |  6 ++++++
 include/net/netns/mib.h          |  3 +++
 include/net/snmp.h               |  6 ++++++
 include/net/tls.h                | 13 +++++++++++++
 include/uapi/linux/snmp.h        |  7 +++++++
 net/tls/Makefile                 |  2 +-
 net/tls/tls_main.c               | 37 +++++++++++++++++++++++++++++++++++++
 net/tls/tls_proc.c               | 37 +++++++++++++++++++++++++++++++++++++
 8 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 net/tls/tls_proc.c

(limited to 'include')

diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index 5bcbf75e2025..a6ee595630ed 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -213,3 +213,9 @@ A patchset to OpenSSL to use ktls as the record layer is
 of calling send directly after a handshake using gnutls.
 Since it doesn't implement a full record layer, control
 messages are not supported.
+
+Statistics
+==========
+
+TLS implementation exposes the following per-namespace statistics
+(``/proc/net/tls_stat``):
diff --git a/include/net/netns/mib.h b/include/net/netns/mib.h
index 830bdf345b17..b5fdb108d602 100644
--- a/include/net/netns/mib.h
+++ b/include/net/netns/mib.h
@@ -24,6 +24,9 @@ struct netns_mib {
 #ifdef CONFIG_XFRM_STATISTICS
 	DEFINE_SNMP_STAT(struct linux_xfrm_mib, xfrm_statistics);
 #endif
+#if IS_ENABLED(CONFIG_TLS)
+	DEFINE_SNMP_STAT(struct linux_tls_mib, tls_statistics);
+#endif
 };
 
 #endif
diff --git a/include/net/snmp.h b/include/net/snmp.h
index cb8ced4380a6..468a67836e2f 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -111,6 +111,12 @@ struct linux_xfrm_mib {
 	unsigned long	mibs[LINUX_MIB_XFRMMAX];
 };
 
+/* Linux TLS */
+#define LINUX_MIB_TLSMAX	__LINUX_MIB_TLSMAX
+struct linux_tls_mib {
+	unsigned long	mibs[LINUX_MIB_TLSMAX];
+};
+
 #define DEFINE_SNMP_STAT(type, name)	\
 	__typeof__(type) __percpu *name
 #define DEFINE_SNMP_STAT_ATOMIC(type, name)	\
diff --git a/include/net/tls.h b/include/net/tls.h
index 38086ade65ce..24c37bffc961 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -43,6 +43,7 @@
 #include <linux/netdevice.h>
 #include <linux/rcupdate.h>
 
+#include <net/net_namespace.h>
 #include <net/tcp.h>
 #include <net/strparser.h>
 #include <crypto/aead.h>
@@ -73,6 +74,15 @@
  */
 #define TLS_AES_CCM_IV_B0_BYTE		2
 
+#define __TLS_INC_STATS(net, field)				\
+	__SNMP_INC_STATS((net)->mib.tls_statistics, field)
+#define TLS_INC_STATS(net, field)				\
+	SNMP_INC_STATS((net)->mib.tls_statistics, field)
+#define __TLS_DEC_STATS(net, field)				\
+	__SNMP_DEC_STATS((net)->mib.tls_statistics, field)
+#define TLS_DEC_STATS(net, field)				\
+	SNMP_DEC_STATS((net)->mib.tls_statistics, field)
+
 enum {
 	TLS_BASE,
 	TLS_SW,
@@ -605,6 +615,9 @@ static inline bool tls_offload_tx_resync_pending(struct sock *sk)
 	return ret;
 }
 
+int __net_init tls_proc_init(struct net *net);
+void __net_exit tls_proc_fini(struct net *net);
+
 int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
 		      unsigned char *record_type);
 int decrypt_skb(struct sock *sk, struct sk_buff *skb,
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 549a31c29f7d..4abd57948ad4 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -323,4 +323,11 @@ enum
 	__LINUX_MIB_XFRMMAX
 };
 
+/* linux TLS mib definitions */
+enum
+{
+	LINUX_MIB_TLSNUM = 0,
+	__LINUX_MIB_TLSMAX
+};
+
 #endif	/* _LINUX_SNMP_H */
diff --git a/net/tls/Makefile b/net/tls/Makefile
index 0606d43d7582..f1ffbfe8968d 100644
--- a/net/tls/Makefile
+++ b/net/tls/Makefile
@@ -7,7 +7,7 @@ CFLAGS_trace.o := -I$(src)
 
 obj-$(CONFIG_TLS) += tls.o
 
-tls-y := tls_main.o tls_sw.o trace.o
+tls-y := tls_main.o tls_sw.o tls_proc.o trace.o
 
 tls-$(CONFIG_TLS_TOE) += tls_toe.o
 tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 237e58e4928a..686eba0df590 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -41,6 +41,7 @@
 #include <linux/inetdevice.h>
 #include <linux/inet_diag.h>
 
+#include <net/snmp.h>
 #include <net/tls.h>
 #include <net/tls_toe.h>
 
@@ -795,6 +796,35 @@ static size_t tls_get_info_size(const struct sock *sk)
 	return size;
 }
 
+static int __net_init tls_init_net(struct net *net)
+{
+	int err;
+
+	net->mib.tls_statistics = alloc_percpu(struct linux_tls_mib);
+	if (!net->mib.tls_statistics)
+		return -ENOMEM;
+
+	err = tls_proc_init(net);
+	if (err)
+		goto err_free_stats;
+
+	return 0;
+err_free_stats:
+	free_percpu(net->mib.tls_statistics);
+	return err;
+}
+
+static void __net_exit tls_exit_net(struct net *net)
+{
+	tls_proc_fini(net);
+	free_percpu(net->mib.tls_statistics);
+}
+
+static struct pernet_operations tls_proc_ops = {
+	.init = tls_init_net,
+	.exit = tls_exit_net,
+};
+
 static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
 	.name			= "tls",
 	.owner			= THIS_MODULE,
@@ -806,6 +836,12 @@ static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
 
 static int __init tls_register(void)
 {
+	int err;
+
+	err = register_pernet_subsys(&tls_proc_ops);
+	if (err)
+		return err;
+
 	tls_sw_proto_ops = inet_stream_ops;
 	tls_sw_proto_ops.splice_read = tls_sw_splice_read;
 
@@ -819,6 +855,7 @@ static void __exit tls_unregister(void)
 {
 	tcp_unregister_ulp(&tcp_tls_ulp_ops);
 	tls_device_cleanup();
+	unregister_pernet_subsys(&tls_proc_ops);
 }
 
 module_init(tls_register);
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
new file mode 100644
index 000000000000..4ecc7c35d2f7
--- /dev/null
+++ b/net/tls/tls_proc.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/snmp.h>
+#include <net/tls.h>
+
+static const struct snmp_mib tls_mib_list[] = {
+	SNMP_MIB_SENTINEL
+};
+
+static int tls_statistics_seq_show(struct seq_file *seq, void *v)
+{
+	unsigned long buf[LINUX_MIB_TLSMAX] = {};
+	struct net *net = seq->private;
+	int i;
+
+	snmp_get_cpu_field_batch(buf, tls_mib_list, net->mib.tls_statistics);
+	for (i = 0; tls_mib_list[i].name; i++)
+		seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]);
+
+	return 0;
+}
+
+int __net_init tls_proc_init(struct net *net)
+{
+	if (!proc_create_net_single("tls_stat", 0444, net->proc_net,
+				    tls_statistics_seq_show, NULL))
+		return -ENOMEM;
+	return 0;
+}
+
+void __net_exit tls_proc_fini(struct net *net)
+{
+	remove_proc_entry("tls_stat", net->proc_net);
+}
-- 
cgit v1.2.3-59-g8ed1b


From b32fd3cc31d723bf2ab859667be3612c0086ec72 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 4 Oct 2019 16:19:25 -0700
Subject: net/tls: add statistics for installed sessions

Add SNMP stats for number of sockets with successfully
installed sessions.  Break them down to software and
hardware ones.  Note that if hardware offload fails
stack uses software implementation, and counts the
session appropriately.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/tls.rst | 14 ++++++++++++++
 include/uapi/linux/snmp.h        |  8 ++++++++
 net/tls/tls_main.c               | 23 +++++++++++++++++++----
 net/tls/tls_proc.c               |  8 ++++++++
 4 files changed, 49 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index a6ee595630ed..cfba587af5c9 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -219,3 +219,17 @@ Statistics
 
 TLS implementation exposes the following per-namespace statistics
 (``/proc/net/tls_stat``):
+
+- ``TlsCurrTxSw``, ``TlsCurrRxSw`` -
+  number of TX and RX sessions currently installed where host handles
+  cryptography
+
+- ``TlsCurrTxDevice``, ``TlsCurrRxDevice`` -
+  number of TX and RX sessions currently installed where NIC handles
+  cryptography
+
+- ``TlsTxSw``, ``TlsRxSw`` -
+  number of TX and RX sessions opened with host cryptography
+
+- ``TlsTxDevice``, ``TlsRxDevice`` -
+  number of TX and RX sessions opened with NIC cryptography
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 4abd57948ad4..1b4613b5af70 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -327,6 +327,14 @@ enum
 enum
 {
 	LINUX_MIB_TLSNUM = 0,
+	LINUX_MIB_TLSCURRTXSW,			/* TlsCurrTxSw */
+	LINUX_MIB_TLSCURRRXSW,			/* TlsCurrRxSw */
+	LINUX_MIB_TLSCURRTXDEVICE,		/* TlsCurrTxDevice */
+	LINUX_MIB_TLSCURRRXDEVICE,		/* TlsCurrRxDevice */
+	LINUX_MIB_TLSTXSW,			/* TlsTxSw */
+	LINUX_MIB_TLSRXSW,			/* TlsRxSw */
+	LINUX_MIB_TLSTXDEVICE,			/* TlsTxDevice */
+	LINUX_MIB_TLSRXDEVICE,			/* TlsRxDevice */
 	__LINUX_MIB_TLSMAX
 };
 
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 686eba0df590..f144b965704e 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -286,14 +286,19 @@ static void tls_sk_proto_cleanup(struct sock *sk,
 		kfree(ctx->tx.rec_seq);
 		kfree(ctx->tx.iv);
 		tls_sw_release_resources_tx(sk);
+		TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW);
 	} else if (ctx->tx_conf == TLS_HW) {
 		tls_device_free_resources_tx(sk);
+		TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE);
 	}
 
-	if (ctx->rx_conf == TLS_SW)
+	if (ctx->rx_conf == TLS_SW) {
 		tls_sw_release_resources_rx(sk);
-	else if (ctx->rx_conf == TLS_HW)
+		TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW);
+	} else if (ctx->rx_conf == TLS_HW) {
 		tls_device_offload_cleanup_rx(sk);
+		TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE);
+	}
 }
 
 static void tls_sk_proto_close(struct sock *sk, long timeout)
@@ -534,19 +539,29 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
 	if (tx) {
 		rc = tls_set_device_offload(sk, ctx);
 		conf = TLS_HW;
-		if (rc) {
+		if (!rc) {
+			TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXDEVICE);
+			TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE);
+		} else {
 			rc = tls_set_sw_offload(sk, ctx, 1);
 			if (rc)
 				goto err_crypto_info;
+			TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW);
+			TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW);
 			conf = TLS_SW;
 		}
 	} else {
 		rc = tls_set_device_offload_rx(sk, ctx);
 		conf = TLS_HW;
-		if (rc) {
+		if (!rc) {
+			TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICE);
+			TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE);
+		} else {
 			rc = tls_set_sw_offload(sk, ctx, 0);
 			if (rc)
 				goto err_crypto_info;
+			TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW);
+			TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW);
 			conf = TLS_SW;
 		}
 		tls_sw_strparser_arm(sk, ctx);
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index 4ecc7c35d2f7..1b1f3783badc 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -7,6 +7,14 @@
 #include <net/tls.h>
 
 static const struct snmp_mib tls_mib_list[] = {
+	SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW),
+	SNMP_MIB_ITEM("TlsCurrRxSw", LINUX_MIB_TLSCURRRXSW),
+	SNMP_MIB_ITEM("TlsCurrTxDevice", LINUX_MIB_TLSCURRTXDEVICE),
+	SNMP_MIB_ITEM("TlsCurrRxDevice", LINUX_MIB_TLSCURRRXDEVICE),
+	SNMP_MIB_ITEM("TlsTxSw", LINUX_MIB_TLSTXSW),
+	SNMP_MIB_ITEM("TlsRxSw", LINUX_MIB_TLSRXSW),
+	SNMP_MIB_ITEM("TlsTxDevice", LINUX_MIB_TLSTXDEVICE),
+	SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE),
 	SNMP_MIB_SENTINEL
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 5c5ec66858062a857cf51f57cbe52b36330f7ae6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 4 Oct 2019 16:19:26 -0700
Subject: net/tls: add TlsDecryptError stat

Add a statistic for TLS record decryption errors.

Since devices are supposed to pass records as-is when they
encounter errors this statistic will count bad records in
both pure software and inline crypto configurations.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/tls.rst | 3 +++
 include/uapi/linux/snmp.h        | 1 +
 net/tls/tls_proc.c               | 1 +
 net/tls/tls_sw.c                 | 5 +++++
 4 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index cfba587af5c9..ab82362dd819 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -233,3 +233,6 @@ TLS implementation exposes the following per-namespace statistics
 
 - ``TlsTxDevice``, ``TlsRxDevice`` -
   number of TX and RX sessions opened with NIC cryptography
+
+- ``TlsDecryptError`` -
+  record decryption failed (e.g. due to incorrect authentication tag)
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 1b4613b5af70..c9e4963e26f0 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -335,6 +335,7 @@ enum
 	LINUX_MIB_TLSRXSW,			/* TlsRxSw */
 	LINUX_MIB_TLSTXDEVICE,			/* TlsTxDevice */
 	LINUX_MIB_TLSRXDEVICE,			/* TlsRxDevice */
+	LINUX_MIB_TLSDECRYPTERROR,		/* TlsDecryptError */
 	__LINUX_MIB_TLSMAX
 };
 
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index 1b1f3783badc..2bea7ef4823c 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -15,6 +15,7 @@ static const struct snmp_mib tls_mib_list[] = {
 	SNMP_MIB_ITEM("TlsRxSw", LINUX_MIB_TLSRXSW),
 	SNMP_MIB_ITEM("TlsTxDevice", LINUX_MIB_TLSTXDEVICE),
 	SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE),
+	SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR),
 	SNMP_MIB_SENTINEL
 };
 
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index c2b5e0d2ba1a..0b1e86f856eb 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -168,6 +168,9 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err)
 
 	/* Propagate if there was an err */
 	if (err) {
+		if (err == -EBADMSG)
+			TLS_INC_STATS(sock_net(skb->sk),
+				      LINUX_MIB_TLSDECRYPTERROR);
 		ctx->async_wait.err = err;
 		tls_err_abort(skb->sk, err);
 	} else {
@@ -253,6 +256,8 @@ static int tls_do_decryption(struct sock *sk,
 			return ret;
 
 		ret = crypto_wait_req(ret, &ctx->async_wait);
+	} else if (ret == -EBADMSG) {
+		TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR);
 	}
 
 	if (async)
-- 
cgit v1.2.3-59-g8ed1b


From a4d26fdbc2a5414bb1b67198656cc7e24a4a3c3a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 4 Oct 2019 16:19:27 -0700
Subject: net/tls: add TlsDeviceRxResync statistic

Add a statistic for number of RX resyncs sent down to the NIC.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/tls.rst | 3 +++
 include/uapi/linux/snmp.h        | 1 +
 net/tls/tls_device.c             | 1 +
 net/tls/tls_proc.c               | 1 +
 4 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/tls.rst b/Documentation/networking/tls.rst
index ab82362dd819..8cb2cd4e2a80 100644
--- a/Documentation/networking/tls.rst
+++ b/Documentation/networking/tls.rst
@@ -236,3 +236,6 @@ TLS implementation exposes the following per-namespace statistics
 
 - ``TlsDecryptError`` -
   record decryption failed (e.g. due to incorrect authentication tag)
+
+- ``TlsDeviceRxResync`` -
+  number of RX resyncs sent to NICs handling cryptography
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index c9e4963e26f0..7eee233e78d2 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -336,6 +336,7 @@ enum
 	LINUX_MIB_TLSTXDEVICE,			/* TlsTxDevice */
 	LINUX_MIB_TLSRXDEVICE,			/* TlsRxDevice */
 	LINUX_MIB_TLSDECRYPTERROR,		/* TlsDecryptError */
+	LINUX_MIB_TLSRXDEVICERESYNC,		/* TlsRxDeviceResync */
 	__LINUX_MIB_TLSMAX
 };
 
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 5a9a86bf0ee1..f306e4c7bf15 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -661,6 +661,7 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx,
 		netdev->tlsdev_ops->tls_dev_resync(netdev, sk, seq, rcd_sn,
 						   TLS_OFFLOAD_CTX_DIR_RX);
 	clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags);
+	TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC);
 }
 
 void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq)
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index 2bea7ef4823c..83d9c80a684e 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -16,6 +16,7 @@ static const struct snmp_mib tls_mib_list[] = {
 	SNMP_MIB_ITEM("TlsTxDevice", LINUX_MIB_TLSTXDEVICE),
 	SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE),
 	SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR),
+	SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC),
 	SNMP_MIB_SENTINEL
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 8273fd845447820c26b38821c8ac297f40a65260 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sat, 5 Oct 2019 08:10:31 +0200
Subject: net: devlink: export devlink net setter

For newly allocated devlink instance allow drivers to set net struct

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h |  2 ++
 net/core/devlink.c    | 15 ++++++++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 3c9d4a063c98..4095657fc23f 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -39,6 +39,7 @@ struct devlink {
 	possible_net_t _net;
 	struct mutex lock;
 	bool reload_failed;
+	bool registered;
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
@@ -772,6 +773,7 @@ static inline struct devlink *netdev_to_devlink(struct net_device *dev)
 struct ib_device;
 
 struct net *devlink_net(const struct devlink *devlink);
+void devlink_net_set(struct devlink *devlink, struct net *net);
 struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size);
 int devlink_register(struct devlink *devlink, struct device *dev);
 void devlink_unregister(struct devlink *devlink);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 0e464d071172..76d835581687 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -101,11 +101,19 @@ struct net *devlink_net(const struct devlink *devlink)
 }
 EXPORT_SYMBOL_GPL(devlink_net);
 
-static void devlink_net_set(struct devlink *devlink, struct net *net)
+static void __devlink_net_set(struct devlink *devlink, struct net *net)
 {
 	write_pnet(&devlink->_net, net);
 }
 
+void devlink_net_set(struct devlink *devlink, struct net *net)
+{
+	if (WARN_ON(devlink->registered))
+		return;
+	__devlink_net_set(devlink, net);
+}
+EXPORT_SYMBOL_GPL(devlink_net_set);
+
 static struct devlink *devlink_get_from_attrs(struct net *net,
 					      struct nlattr **attrs)
 {
@@ -2750,7 +2758,7 @@ static void devlink_reload_netns_change(struct devlink *devlink,
 				     DEVLINK_CMD_PARAM_DEL);
 	devlink_notify(devlink, DEVLINK_CMD_DEL);
 
-	devlink_net_set(devlink, dest_net);
+	__devlink_net_set(devlink, dest_net);
 
 	devlink_notify(devlink, DEVLINK_CMD_NEW);
 	list_for_each_entry(param_item, &devlink->param_list, list)
@@ -6278,7 +6286,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	if (!devlink)
 		return NULL;
 	devlink->ops = ops;
-	devlink_net_set(devlink, &init_net);
+	__devlink_net_set(devlink, &init_net);
 	INIT_LIST_HEAD(&devlink->port_list);
 	INIT_LIST_HEAD(&devlink->sb_list);
 	INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
@@ -6304,6 +6312,7 @@ int devlink_register(struct devlink *devlink, struct device *dev)
 {
 	mutex_lock(&devlink_mutex);
 	devlink->dev = dev;
+	devlink->registered = true;
 	list_add_tail(&devlink->list, &devlink_list);
 	devlink_notify(devlink, DEVLINK_CMD_NEW);
 	mutex_unlock(&devlink_mutex);
-- 
cgit v1.2.3-59-g8ed1b


From 1927f41a22a05e3bc178fa47f7ce7be271fbc541 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sat, 5 Oct 2019 20:04:34 +0200
Subject: net: genetlink: introduce dump info struct to be available during
 dumpit op

Currently the cb->data is taken by ops during non-parallel dumping.
Introduce a new structure genl_dumpit_info and store the ops there.
Distribute the info to both non-parallel and parallel dumping. Also add
a helper genl_dumpit_info() to easily get the info structure in the
dumpit callback from cb.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 14 ++++++++++++++
 net/netlink/genetlink.c | 47 ++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 52 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 9292f1c588b7..fb838f4b0089 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -127,6 +127,20 @@ enum genl_validate_flags {
 	GENL_DONT_VALIDATE_DUMP_STRICT		= BIT(2),
 };
 
+/**
+ * struct genl_info - info that is available during dumpit op call
+ * @ops: generic netlink ops - for internal genl code usage
+ */
+struct genl_dumpit_info {
+	const struct genl_ops *ops;
+};
+
+static inline const struct genl_dumpit_info *
+genl_dumpit_info(struct netlink_callback *cb)
+{
+	return cb->data;
+}
+
 /**
  * struct genl_ops - generic netlink operations
  * @cmd: command identifier
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index b5fa98b1577d..c785080e9401 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -458,10 +458,19 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
 }
 EXPORT_SYMBOL(genlmsg_put);
 
+static struct genl_dumpit_info *genl_dumpit_info_alloc(void)
+{
+	return kmalloc(sizeof(struct genl_dumpit_info), GFP_KERNEL);
+}
+
+static void genl_dumpit_info_free(const struct genl_dumpit_info *info)
+{
+	kfree(info);
+}
+
 static int genl_lock_start(struct netlink_callback *cb)
 {
-	/* our ops are always const - netlink API doesn't propagate that */
-	const struct genl_ops *ops = cb->data;
+	const struct genl_ops *ops = genl_dumpit_info(cb)->ops;
 	int rc = 0;
 
 	if (ops->start) {
@@ -474,8 +483,7 @@ static int genl_lock_start(struct netlink_callback *cb)
 
 static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	/* our ops are always const - netlink API doesn't propagate that */
-	const struct genl_ops *ops = cb->data;
+	const struct genl_ops *ops = genl_dumpit_info(cb)->ops;
 	int rc;
 
 	genl_lock();
@@ -486,8 +494,8 @@ static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 
 static int genl_lock_done(struct netlink_callback *cb)
 {
-	/* our ops are always const - netlink API doesn't propagate that */
-	const struct genl_ops *ops = cb->data;
+	const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+	const struct genl_ops *ops = info->ops;
 	int rc = 0;
 
 	if (ops->done) {
@@ -495,6 +503,19 @@ static int genl_lock_done(struct netlink_callback *cb)
 		rc = ops->done(cb);
 		genl_unlock();
 	}
+	genl_dumpit_info_free(info);
+	return rc;
+}
+
+static int genl_parallel_done(struct netlink_callback *cb)
+{
+	const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+	const struct genl_ops *ops = info->ops;
+	int rc = 0;
+
+	if (ops->done)
+		rc = ops->done(cb);
+	genl_dumpit_info_free(info);
 	return rc;
 }
 
@@ -505,6 +526,7 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family,
 				      const struct genl_ops *ops,
 				      int hdrlen, struct net *net)
 {
+	struct genl_dumpit_info *info;
 	int err;
 
 	if (!ops->dumpit)
@@ -528,11 +550,17 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family,
 		}
 	}
 
+	/* Allocate dumpit info. It is going to be freed by done() callback. */
+	info = genl_dumpit_info_alloc();
+	if (!info)
+		return -ENOMEM;
+
+	info->ops = ops;
+
 	if (!family->parallel_ops) {
 		struct netlink_dump_control c = {
 			.module = family->module,
-			/* we have const, but the netlink API doesn't */
-			.data = (void *)ops,
+			.data = info,
 			.start = genl_lock_start,
 			.dump = genl_lock_dumpit,
 			.done = genl_lock_done,
@@ -545,9 +573,10 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family,
 	} else {
 		struct netlink_dump_control c = {
 			.module = family->module,
+			.data = info,
 			.start = ops->start,
 			.dump = ops->dumpit,
-			.done = ops->done,
+			.done = genl_parallel_done,
 		};
 
 		err = __netlink_dump_start(net->genl_sock, skb, nlh, &c);
-- 
cgit v1.2.3-59-g8ed1b


From bf813b0afeae2f012f0e527a526c1b78ca21ad82 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sat, 5 Oct 2019 20:04:36 +0200
Subject: net: genetlink: parse attrs and store in contect info struct during
 dumpit

Extend the dumpit info struct for attrs. Instead of existing attribute
validation do parse them and save in the info struct. Caller can benefit
from this and does not have to do parse itself. In order to properly
free attrs, genl_family pointer needs to be added to dumpit info struct
as well.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h |  4 ++++
 net/netlink/genetlink.c | 39 ++++++++++++++++++++++-----------------
 2 files changed, 26 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index fb838f4b0089..922dcc9348b1 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -129,10 +129,14 @@ enum genl_validate_flags {
 
 /**
  * struct genl_info - info that is available during dumpit op call
+ * @family: generic netlink family - for internal genl code usage
  * @ops: generic netlink ops - for internal genl code usage
+ * @attrs: netlink attributes
  */
 struct genl_dumpit_info {
+	const struct genl_family *family;
 	const struct genl_ops *ops;
+	struct nlattr **attrs;
 };
 
 static inline const struct genl_dumpit_info *
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index a98c94594508..8059118ee5a1 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -542,6 +542,7 @@ static int genl_lock_done(struct netlink_callback *cb)
 		rc = ops->done(cb);
 		genl_unlock();
 	}
+	genl_family_rcv_msg_attrs_free(info->family, info->attrs);
 	genl_dumpit_info_free(info);
 	return rc;
 }
@@ -554,6 +555,7 @@ static int genl_parallel_done(struct netlink_callback *cb)
 
 	if (ops->done)
 		rc = ops->done(cb);
+	genl_family_rcv_msg_attrs_free(info->family, info->attrs);
 	genl_dumpit_info_free(info);
 	return rc;
 }
@@ -566,35 +568,38 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family,
 				      int hdrlen, struct net *net)
 {
 	struct genl_dumpit_info *info;
+	struct nlattr **attrs = NULL;
 	int err;
 
 	if (!ops->dumpit)
 		return -EOPNOTSUPP;
 
-	if (!(ops->validate & GENL_DONT_VALIDATE_DUMP)) {
-		if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
-			return -EINVAL;
+	if (ops->validate & GENL_DONT_VALIDATE_DUMP)
+		goto no_attrs;
 
-		if (family->maxattr) {
-			unsigned int validate = NL_VALIDATE_STRICT;
-
-			if (ops->validate & GENL_DONT_VALIDATE_DUMP_STRICT)
-				validate = NL_VALIDATE_LIBERAL;
-			err = __nla_validate(nlmsg_attrdata(nlh, hdrlen),
-					     nlmsg_attrlen(nlh, hdrlen),
-					     family->maxattr, family->policy,
-					     validate, extack);
-			if (err)
-				return err;
-		}
-	}
+	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
+		return -EINVAL;
+
+	if (!family->maxattr)
+		goto no_attrs;
 
+	attrs = genl_family_rcv_msg_attrs_parse(family, nlh, extack,
+						ops, hdrlen,
+						GENL_DONT_VALIDATE_DUMP_STRICT);
+	if (IS_ERR(attrs))
+		return PTR_ERR(attrs);
+
+no_attrs:
 	/* Allocate dumpit info. It is going to be freed by done() callback. */
 	info = genl_dumpit_info_alloc();
-	if (!info)
+	if (!info) {
+		genl_family_rcv_msg_attrs_free(family, attrs);
 		return -ENOMEM;
+	}
 
+	info->family = family;
 	info->ops = ops;
+	info->attrs = attrs;
 
 	if (!family->parallel_ops) {
 		struct netlink_dump_control c = {
-- 
cgit v1.2.3-59-g8ed1b


From 265ecd4fa3f0ca43909f8b2cc0e519966f21b167 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sat, 5 Oct 2019 20:04:41 +0200
Subject: net: genetlink: remove unused genl_family_attrbuf()

genl_family_attrbuf() function is no longer used by anyone, so remove it.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h |  2 --
 net/netlink/genetlink.c | 19 -------------------
 2 files changed, 21 deletions(-)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 922dcc9348b1..74950663bb00 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -75,8 +75,6 @@ struct genl_family {
 	struct module		*module;
 };
 
-struct nlattr **genl_family_attrbuf(const struct genl_family *family);
-
 /**
  * struct genl_info - receiving information
  * @snd_seq: sending sequence number
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 8059118ee5a1..1b5046436765 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -1164,25 +1164,6 @@ problem:
 
 subsys_initcall(genl_init);
 
-/**
- * genl_family_attrbuf - return family's attrbuf
- * @family: the family
- *
- * Return the family's attrbuf, while validating that it's
- * actually valid to access it.
- *
- * You cannot use this function with a family that has parallel_ops
- * and you can only use it within (pre/post) doit/dumpit callbacks.
- */
-struct nlattr **genl_family_attrbuf(const struct genl_family *family)
-{
-	if (!WARN_ON(family->parallel_ops))
-		lockdep_assert_held(&genl_mutex);
-
-	return family->attrbuf;
-}
-EXPORT_SYMBOL(genl_family_attrbuf);
-
 static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
 			 gfp_t flags)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 5f0e5412781b01708f622d00c0b3f77b9dca7367 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andriin@fb.com>
Date: Sun, 6 Oct 2019 20:07:36 -0700
Subject: uapi/bpf: fix helper docs

Various small fixes to BPF helper documentation comments, enabling
automatic header generation with a list of BPF helpers.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 32 ++++++++++++++++----------------
 tools/include/uapi/linux/bpf.h | 32 ++++++++++++++++----------------
 2 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 77c6be96d676..a65c3b0c6935 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -794,7 +794,7 @@ union bpf_attr {
  * 		A 64-bit integer containing the current GID and UID, and
  * 		created as such: *current_gid* **<< 32 \|** *current_uid*.
  *
- * int bpf_get_current_comm(char *buf, u32 size_of_buf)
+ * int bpf_get_current_comm(void *buf, u32 size_of_buf)
  * 	Description
  * 		Copy the **comm** attribute of the current task into *buf* of
  * 		*size_of_buf*. The **comm** attribute contains the name of
@@ -1023,7 +1023,7 @@ union bpf_attr {
  * 		The realm of the route for the packet associated to *skb*, or 0
  * 		if none was found.
  *
- * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
  * 	Description
  * 		Write raw *data* blob into a special BPF perf event held by
  * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
@@ -1068,7 +1068,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+ * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len)
  * 	Description
  * 		This helper was provided as an easy way to load data from a
  * 		packet. It can be used to load *len* bytes from *offset* from
@@ -1085,7 +1085,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
+ * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags)
  * 	Description
  * 		Walk a user or a kernel stack and return its id. To achieve
  * 		this, the helper needs *ctx*, which is a pointer to the context
@@ -1154,7 +1154,7 @@ union bpf_attr {
  * 		The checksum result, or a negative error code in case of
  * 		failure.
  *
- * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size)
  * 	Description
  * 		Retrieve tunnel options metadata for the packet associated to
  * 		*skb*, and store the raw tunnel option data to the buffer *opt*
@@ -1172,7 +1172,7 @@ union bpf_attr {
  * 	Return
  * 		The size of the option data retrieved.
  *
- * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size)
  * 	Description
  * 		Set tunnel options metadata for the packet associated to *skb*
  * 		to the option data contained in the raw buffer *opt* of *size*.
@@ -1511,7 +1511,7 @@ union bpf_attr {
  * 	Return
  * 		0
  *
- * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
+ * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen)
  * 	Description
  * 		Emulate a call to **setsockopt()** on the socket associated to
  * 		*bpf_socket*, which must be a full socket. The *level* at
@@ -1595,7 +1595,7 @@ union bpf_attr {
  * 	Return
  * 		**XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
  *
- * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags)
  * 	Description
  * 		Redirect the packet to the socket referenced by *map* (of type
  * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
@@ -1715,7 +1715,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
+ * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen)
  * 	Description
  * 		Emulate a call to **getsockopt()** on the socket associated to
  * 		*bpf_socket*, which must be a full socket. The *level* at
@@ -1947,7 +1947,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
+ * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags)
  * 	Description
  * 		Return a user or a kernel stack in bpf program provided buffer.
  * 		To achieve this, the helper needs *ctx*, which is a pointer
@@ -1980,7 +1980,7 @@ union bpf_attr {
  * 		A non-negative value equal to or less than *size* on success,
  * 		or a negative error in case of failure.
  *
- * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
+ * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header)
  * 	Description
  * 		This helper is similar to **bpf_skb_load_bytes**\ () in that
  * 		it provides an easy way to load *len* bytes from *offset*
@@ -2033,7 +2033,7 @@ union bpf_attr {
  *		* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
  *		  packet is not forwarded or needs assist from full stack
  *
- * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
  *	Description
  *		Add an entry to, or update a sockhash *map* referencing sockets.
  *		The *skops* is used as a new value for the entry associated to
@@ -2392,7 +2392,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags)
+ * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
  *	Description
  *		For socket policies, insert *len* bytes into *msg* at offset
  *		*start*.
@@ -2408,9 +2408,9 @@ union bpf_attr {
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
- * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
+ * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
  *	Description
- *		Will remove *pop* bytes from a *msg* starting at byte *start*.
+ *		Will remove *len* bytes from a *msg* starting at byte *start*.
  *		This may result in **ENOMEM** errors under certain situations if
  *		an allocation and copy are required due to a full ring buffer.
  *		However, the helper will try to avoid doing the allocation
@@ -2505,7 +2505,7 @@ union bpf_attr {
  *		A **struct bpf_tcp_sock** pointer on success, or **NULL** in
  *		case of failure.
  *
- * int bpf_skb_ecn_set_ce(struct sk_buf *skb)
+ * int bpf_skb_ecn_set_ce(struct sk_buff *skb)
  *	Description
  *		Set ECN (Explicit Congestion Notification) field of IP header
  *		to **CE** (Congestion Encountered) if current value is **ECT**
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 77c6be96d676..a65c3b0c6935 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -794,7 +794,7 @@ union bpf_attr {
  * 		A 64-bit integer containing the current GID and UID, and
  * 		created as such: *current_gid* **<< 32 \|** *current_uid*.
  *
- * int bpf_get_current_comm(char *buf, u32 size_of_buf)
+ * int bpf_get_current_comm(void *buf, u32 size_of_buf)
  * 	Description
  * 		Copy the **comm** attribute of the current task into *buf* of
  * 		*size_of_buf*. The **comm** attribute contains the name of
@@ -1023,7 +1023,7 @@ union bpf_attr {
  * 		The realm of the route for the packet associated to *skb*, or 0
  * 		if none was found.
  *
- * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
  * 	Description
  * 		Write raw *data* blob into a special BPF perf event held by
  * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
@@ -1068,7 +1068,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+ * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len)
  * 	Description
  * 		This helper was provided as an easy way to load data from a
  * 		packet. It can be used to load *len* bytes from *offset* from
@@ -1085,7 +1085,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
+ * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags)
  * 	Description
  * 		Walk a user or a kernel stack and return its id. To achieve
  * 		this, the helper needs *ctx*, which is a pointer to the context
@@ -1154,7 +1154,7 @@ union bpf_attr {
  * 		The checksum result, or a negative error code in case of
  * 		failure.
  *
- * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size)
  * 	Description
  * 		Retrieve tunnel options metadata for the packet associated to
  * 		*skb*, and store the raw tunnel option data to the buffer *opt*
@@ -1172,7 +1172,7 @@ union bpf_attr {
  * 	Return
  * 		The size of the option data retrieved.
  *
- * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size)
  * 	Description
  * 		Set tunnel options metadata for the packet associated to *skb*
  * 		to the option data contained in the raw buffer *opt* of *size*.
@@ -1511,7 +1511,7 @@ union bpf_attr {
  * 	Return
  * 		0
  *
- * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
+ * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen)
  * 	Description
  * 		Emulate a call to **setsockopt()** on the socket associated to
  * 		*bpf_socket*, which must be a full socket. The *level* at
@@ -1595,7 +1595,7 @@ union bpf_attr {
  * 	Return
  * 		**XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
  *
- * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags)
  * 	Description
  * 		Redirect the packet to the socket referenced by *map* (of type
  * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
@@ -1715,7 +1715,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
+ * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen)
  * 	Description
  * 		Emulate a call to **getsockopt()** on the socket associated to
  * 		*bpf_socket*, which must be a full socket. The *level* at
@@ -1947,7 +1947,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
+ * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags)
  * 	Description
  * 		Return a user or a kernel stack in bpf program provided buffer.
  * 		To achieve this, the helper needs *ctx*, which is a pointer
@@ -1980,7 +1980,7 @@ union bpf_attr {
  * 		A non-negative value equal to or less than *size* on success,
  * 		or a negative error in case of failure.
  *
- * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
+ * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header)
  * 	Description
  * 		This helper is similar to **bpf_skb_load_bytes**\ () in that
  * 		it provides an easy way to load *len* bytes from *offset*
@@ -2033,7 +2033,7 @@ union bpf_attr {
  *		* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
  *		  packet is not forwarded or needs assist from full stack
  *
- * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
  *	Description
  *		Add an entry to, or update a sockhash *map* referencing sockets.
  *		The *skops* is used as a new value for the entry associated to
@@ -2392,7 +2392,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags)
+ * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
  *	Description
  *		For socket policies, insert *len* bytes into *msg* at offset
  *		*start*.
@@ -2408,9 +2408,9 @@ union bpf_attr {
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
- * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
+ * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
  *	Description
- *		Will remove *pop* bytes from a *msg* starting at byte *start*.
+ *		Will remove *len* bytes from a *msg* starting at byte *start*.
  *		This may result in **ENOMEM** errors under certain situations if
  *		an allocation and copy are required due to a full ring buffer.
  *		However, the helper will try to avoid doing the allocation
@@ -2505,7 +2505,7 @@ union bpf_attr {
  *		A **struct bpf_tcp_sock** pointer on success, or **NULL** in
  *		case of failure.
  *
- * int bpf_skb_ecn_set_ce(struct sk_buf *skb)
+ * int bpf_skb_ecn_set_ce(struct sk_buff *skb)
  *	Description
  *		Set ECN (Explicit Congestion Notification) field of IP header
  *		to **CE** (Congestion Encountered) if current value is **ECT**
-- 
cgit v1.2.3-59-g8ed1b


From b9df4fd7e99cb8bfd80c4143f3045d63b1754ad0 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sun, 6 Oct 2019 18:19:54 +0200
Subject: net: core: change return type of pskb_may_pull to bool

This function de-facto returns a bool, so let's change the return type
accordingly.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4351577b14d7..0a58402a166e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2261,12 +2261,12 @@ static inline void *pskb_pull(struct sk_buff *skb, unsigned int len)
 	return unlikely(len > skb->len) ? NULL : __pskb_pull(skb, len);
 }
 
-static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
+static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len)
 {
 	if (likely(len <= skb_headlen(skb)))
-		return 1;
+		return true;
 	if (unlikely(len > skb->len))
-		return 0;
+		return false;
 	return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 328908621081c3c7455c39549c5334e74b7c525a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 7 Oct 2019 09:37:27 -0400
Subject: ipv6: Make ipv6_mc_may_pull() return bool.

Consistent with how pskb_may_pull() also now does so.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 3f62b347b04a..1bab88184d3c 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -202,11 +202,11 @@ u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr,
 /*
  *	multicast prototypes (mcast.c)
  */
-static inline int ipv6_mc_may_pull(struct sk_buff *skb,
-				   unsigned int len)
+static inline bool ipv6_mc_may_pull(struct sk_buff *skb,
+				    unsigned int len)
 {
 	if (skb_transport_offset(skb) + ipv6_transport_len(skb) < len)
-		return 0;
+		return false;
 
 	return pskb_may_pull(skb, len);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 163ab96b52ae2bb2d8f188cd29f0b570610f9007 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Sun, 6 Oct 2019 21:09:27 -0700
Subject: net: sockmap: use bitmap for copy info

Don't use bool array in struct sk_msg_sg, save 12 bytes.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skmsg.h | 12 ++++++++----
 net/core/filter.c     |  4 ++--
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index e4b3fb4bb77c..fe80d537945d 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -28,13 +28,14 @@ struct sk_msg_sg {
 	u32				end;
 	u32				size;
 	u32				copybreak;
-	bool				copy[MAX_MSG_FRAGS];
+	unsigned long			copy;
 	/* The extra element is used for chaining the front and sections when
 	 * the list becomes partitioned (e.g. end < start). The crypto APIs
 	 * require the chaining.
 	 */
 	struct scatterlist		data[MAX_MSG_FRAGS + 1];
 };
+static_assert(BITS_PER_LONG >= MAX_MSG_FRAGS);
 
 /* UAPI in filter.c depends on struct sk_msg_sg being first element. */
 struct sk_msg {
@@ -227,7 +228,7 @@ static inline void sk_msg_compute_data_pointers(struct sk_msg *msg)
 {
 	struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start);
 
-	if (msg->sg.copy[msg->sg.start]) {
+	if (test_bit(msg->sg.start, &msg->sg.copy)) {
 		msg->data = NULL;
 		msg->data_end = NULL;
 	} else {
@@ -246,7 +247,7 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page,
 	sg_set_page(sge, page, len, offset);
 	sg_unmark_end(sge);
 
-	msg->sg.copy[msg->sg.end] = true;
+	__set_bit(msg->sg.end, &msg->sg.copy);
 	msg->sg.size += len;
 	sk_msg_iter_next(msg, end);
 }
@@ -254,7 +255,10 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page,
 static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state)
 {
 	do {
-		msg->sg.copy[i] = copy_state;
+		if (copy_state)
+			__set_bit(i, &msg->sg.copy);
+		else
+			__clear_bit(i, &msg->sg.copy);
 		sk_msg_iter_var_next(i);
 		if (i == msg->sg.end)
 			break;
diff --git a/net/core/filter.c b/net/core/filter.c
index ed6563622ce3..46196e212413 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2245,7 +2245,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
 	 * account for the headroom.
 	 */
 	bytes_sg_total = start - offset + bytes;
-	if (!msg->sg.copy[i] && bytes_sg_total <= len)
+	if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len)
 		goto out;
 
 	/* At this point we need to linearize multiple scatterlist
@@ -2450,7 +2450,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
 	/* Place newly allocated data buffer */
 	sk_mem_charge(msg->sk, len);
 	msg->sg.size += len;
-	msg->sg.copy[new] = false;
+	__clear_bit(new, &msg->sg.copy);
 	sg_set_page(&msg->sg.data[new], page, len + copy, 0);
 	if (rsge.length) {
 		get_page(sg_page(&rsge));
-- 
cgit v1.2.3-59-g8ed1b


From 4de30a8d58c90e18140342cdcb74903d2e4fbb62 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Sun, 6 Oct 2019 21:09:30 -0700
Subject: net/tls: pass context to tls_device_decrypted()

Avoid unnecessary pointer chasing and calculations, callers already
have most of the state tls_device_decrypted() needs.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h    | 7 +++++--
 net/tls/tls_device.c | 5 ++---
 net/tls/tls_sw.c     | 2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 24c37bffc961..b809f2362049 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -641,7 +641,8 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx);
 void tls_device_offload_cleanup_rx(struct sock *sk);
 void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq);
 void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq);
-int tls_device_decrypted(struct sock *sk, struct sk_buff *skb);
+int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
+			 struct sk_buff *skb, struct strp_msg *rxm);
 #else
 static inline void tls_device_init(void) {}
 static inline void tls_device_cleanup(void) {}
@@ -664,7 +665,9 @@ static inline void tls_device_offload_cleanup_rx(struct sock *sk) {}
 static inline void
 tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) {}
 
-static inline int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
+static inline int
+tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
+		     struct sk_buff *skb, struct strp_msg *rxm)
 {
 	return 0;
 }
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 23c19b8ff04e..33b267b052c0 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -846,11 +846,10 @@ free_buf:
 	return err;
 }
 
-int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
+int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
+			 struct sk_buff *skb, struct strp_msg *rxm)
 {
-	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx);
-	struct strp_msg *rxm = strp_msg(skb);
 	int is_decrypted = skb->decrypted;
 	int is_encrypted = !is_decrypted;
 	struct sk_buff *skb_iter;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 0b1e86f856eb..954f451dcc57 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1495,7 +1495,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
 
 	if (!ctx->decrypted) {
 		if (tls_ctx->rx_conf == TLS_HW) {
-			err = tls_device_decrypted(sk, skb);
+			err = tls_device_decrypted(sk, tls_ctx, skb, rxm);
 			if (err < 0)
 				return err;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 5c5458ec9d631fbca29f53a944168265e18aa77a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Sun, 6 Oct 2019 21:09:31 -0700
Subject: net/tls: store async_capable on a single bit

Store async_capable on a single bit instead of a full integer
to save space.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 4 ++--
 net/tls/tls_sw.c  | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index b809f2362049..97eae7271a67 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -136,7 +136,7 @@ struct tls_sw_context_tx {
 	struct list_head tx_list;
 	atomic_t encrypt_pending;
 	int async_notify;
-	int async_capable;
+	u8 async_capable:1;
 
 #define BIT_TX_SCHEDULED	0
 #define BIT_TX_CLOSING		1
@@ -152,7 +152,7 @@ struct tls_sw_context_rx {
 
 	struct sk_buff *recv_pkt;
 	u8 control;
-	int async_capable;
+	u8 async_capable:1;
 	bool decrypted;
 	atomic_t decrypt_pending;
 	bool async_notify;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 954f451dcc57..c006b587a7db 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2391,10 +2391,11 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 		tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv);
 
 		if (crypto_info->version == TLS_1_3_VERSION)
-			sw_ctx_rx->async_capable = false;
+			sw_ctx_rx->async_capable = 0;
 		else
 			sw_ctx_rx->async_capable =
-				tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC;
+				!!(tfm->__crt_alg->cra_flags &
+				   CRYPTO_ALG_ASYNC);
 
 		/* Set up strparser */
 		memset(&cb, 0, sizeof(cb));
-- 
cgit v1.2.3-59-g8ed1b


From bc76e5bb1229ede1f26317b813099b0e983e4009 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Sun, 6 Oct 2019 21:09:32 -0700
Subject: net/tls: store decrypted on a single bit

Use a single bit instead of boolean to remember if packet
was already decrypted.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 2 +-
 net/tls/tls_sw.c  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 97eae7271a67..41265e542e71 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -153,7 +153,7 @@ struct tls_sw_context_rx {
 	struct sk_buff *recv_pkt;
 	u8 control;
 	u8 async_capable:1;
-	bool decrypted;
+	u8 decrypted:1;
 	atomic_t decrypt_pending;
 	bool async_notify;
 };
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index c006b587a7db..de7561d4cfa5 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1523,7 +1523,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
 		rxm->offset += prot->prepend_size;
 		rxm->full_len -= prot->overhead_size;
 		tls_advance_record_sn(sk, prot, &tls_ctx->rx);
-		ctx->decrypted = true;
+		ctx->decrypted = 1;
 		ctx->saved_data_ready(sk);
 	} else {
 		*zc = false;
@@ -1933,7 +1933,7 @@ ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
 			tls_err_abort(sk, EBADMSG);
 			goto splice_read_end;
 		}
-		ctx->decrypted = true;
+		ctx->decrypted = 1;
 	}
 	rxm = strp_msg(skb);
 
@@ -2034,7 +2034,7 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb)
 	struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
 	struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
 
-	ctx->decrypted = false;
+	ctx->decrypted = 0;
 
 	ctx->recv_pkt = skb;
 	strp_pause(strp);
-- 
cgit v1.2.3-59-g8ed1b


From 017f77c050a3bc1f1ff877d1f265beeee26d7dea Mon Sep 17 00:00:00 2001
From: Jeremy Sowden <jeremy@azazel.net>
Date: Thu, 3 Oct 2019 20:56:01 +0100
Subject: netfilter: ipset: add a coding-style fix to ip_set_ext_destroy.

Use a local variable to hold comment in order to align the arguments of
ip_set_comment_free properly.

Signed-off-by: Jeremy Sowden <jeremy@azazel.net>
Acked-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 9bc255a8461b..9fee4837d02c 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -269,9 +269,11 @@ ip_set_ext_destroy(struct ip_set *set, void *data)
 	/* Check that the extension is enabled for the set and
 	 * call it's destroy function for its extension part in data.
 	 */
-	if (SET_WITH_COMMENT(set))
-		ip_set_extensions[IPSET_EXT_ID_COMMENT].destroy(
-			set, ext_comment(data, set));
+	if (SET_WITH_COMMENT(set)) {
+		struct ip_set_comment *c = ext_comment(data, set);
+
+		ip_set_extensions[IPSET_EXT_ID_COMMENT].destroy(set, c);
+	}
 }
 
 static inline int
-- 
cgit v1.2.3-59-g8ed1b


From 94177f6e11c74b6ca3bcf7f65d3d74f00bbd6a8c Mon Sep 17 00:00:00 2001
From: Jeremy Sowden <jeremy@azazel.net>
Date: Thu, 3 Oct 2019 20:56:03 +0100
Subject: netfilter: ipset: move ip_set_comment functions from ip_set.h to
 ip_set_core.c.

Most of the functions are only called from within ip_set_core.c.

The exception is ip_set_init_comment.  However, this is too complex to
be a good candidate for a static inline function.  Move it to
ip_set_core.c, change its linkage to extern and export it, leaving a
declaration in ip_set.h.

ip_set_comment_free is only used as an extension destructor, so change
its prototype to match and drop cast.

Signed-off-by: Jeremy Sowden <jeremy@azazel.net>
Acked-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h | 63 ++------------------------------
 net/netfilter/ipset/ip_set_core.c      | 66 +++++++++++++++++++++++++++++++++-
 2 files changed, 67 insertions(+), 62 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 9fee4837d02c..985c9bb1ab65 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -521,67 +521,8 @@ ip_set_timeout_get(const unsigned long *timeout)
 	return t == 0 ? 1 : t;
 }
 
-static inline char*
-ip_set_comment_uget(struct nlattr *tb)
-{
-	return nla_data(tb);
-}
-
-/* Called from uadd only, protected by the set spinlock.
- * The kadt functions don't use the comment extensions in any way.
- */
-static inline void
-ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment,
-		    const struct ip_set_ext *ext)
-{
-	struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1);
-	size_t len = ext->comment ? strlen(ext->comment) : 0;
-
-	if (unlikely(c)) {
-		set->ext_size -= sizeof(*c) + strlen(c->str) + 1;
-		kfree_rcu(c, rcu);
-		rcu_assign_pointer(comment->c, NULL);
-	}
-	if (!len)
-		return;
-	if (unlikely(len > IPSET_MAX_COMMENT_SIZE))
-		len = IPSET_MAX_COMMENT_SIZE;
-	c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC);
-	if (unlikely(!c))
-		return;
-	strlcpy(c->str, ext->comment, len + 1);
-	set->ext_size += sizeof(*c) + strlen(c->str) + 1;
-	rcu_assign_pointer(comment->c, c);
-}
-
-/* Used only when dumping a set, protected by rcu_read_lock() */
-static inline int
-ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment)
-{
-	struct ip_set_comment_rcu *c = rcu_dereference(comment->c);
-
-	if (!c)
-		return 0;
-	return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str);
-}
-
-/* Called from uadd/udel, flush or the garbage collectors protected
- * by the set spinlock.
- * Called when the set is destroyed and when there can't be any user
- * of the set data anymore.
- */
-static inline void
-ip_set_comment_free(struct ip_set *set, struct ip_set_comment *comment)
-{
-	struct ip_set_comment_rcu *c;
-
-	c = rcu_dereference_protected(comment->c, 1);
-	if (unlikely(!c))
-		return;
-	set->ext_size -= sizeof(*c) + strlen(c->str) + 1;
-	kfree_rcu(c, rcu);
-	rcu_assign_pointer(comment->c, NULL);
-}
+void ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment,
+			 const struct ip_set_ext *ext);
 
 static inline void
 ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter)
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 04266295a750..73daea6d4bd5 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -325,6 +325,70 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
 }
 EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
 
+static char *
+ip_set_comment_uget(struct nlattr *tb)
+{
+	return nla_data(tb);
+}
+
+/* Called from uadd only, protected by the set spinlock.
+ * The kadt functions don't use the comment extensions in any way.
+ */
+void
+ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment,
+		    const struct ip_set_ext *ext)
+{
+	struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1);
+	size_t len = ext->comment ? strlen(ext->comment) : 0;
+
+	if (unlikely(c)) {
+		set->ext_size -= sizeof(*c) + strlen(c->str) + 1;
+		kfree_rcu(c, rcu);
+		rcu_assign_pointer(comment->c, NULL);
+	}
+	if (!len)
+		return;
+	if (unlikely(len > IPSET_MAX_COMMENT_SIZE))
+		len = IPSET_MAX_COMMENT_SIZE;
+	c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC);
+	if (unlikely(!c))
+		return;
+	strlcpy(c->str, ext->comment, len + 1);
+	set->ext_size += sizeof(*c) + strlen(c->str) + 1;
+	rcu_assign_pointer(comment->c, c);
+}
+EXPORT_SYMBOL_GPL(ip_set_init_comment);
+
+/* Used only when dumping a set, protected by rcu_read_lock() */
+static int
+ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment)
+{
+	struct ip_set_comment_rcu *c = rcu_dereference(comment->c);
+
+	if (!c)
+		return 0;
+	return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str);
+}
+
+/* Called from uadd/udel, flush or the garbage collectors protected
+ * by the set spinlock.
+ * Called when the set is destroyed and when there can't be any user
+ * of the set data anymore.
+ */
+static void
+ip_set_comment_free(struct ip_set *set, void *ptr)
+{
+	struct ip_set_comment *comment = ptr;
+	struct ip_set_comment_rcu *c;
+
+	c = rcu_dereference_protected(comment->c, 1);
+	if (unlikely(!c))
+		return;
+	set->ext_size -= sizeof(*c) + strlen(c->str) + 1;
+	kfree_rcu(c, rcu);
+	rcu_assign_pointer(comment->c, NULL);
+}
+
 typedef void (*destroyer)(struct ip_set *, void *);
 /* ipset data extension types, in size order */
 
@@ -351,7 +415,7 @@ const struct ip_set_ext_type ip_set_extensions[] = {
 		.flag	 = IPSET_FLAG_WITH_COMMENT,
 		.len	 = sizeof(struct ip_set_comment),
 		.align	 = __alignof__(struct ip_set_comment),
-		.destroy = (destroyer) ip_set_comment_free,
+		.destroy = ip_set_comment_free,
 	},
 };
 EXPORT_SYMBOL_GPL(ip_set_extensions);
-- 
cgit v1.2.3-59-g8ed1b


From 2398a97688f1aaca09d0a5a809f361e2abf5ff3c Mon Sep 17 00:00:00 2001
From: Jeremy Sowden <jeremy@azazel.net>
Date: Thu, 3 Oct 2019 20:56:04 +0100
Subject: netfilter: ipset: move functions to ip_set_core.c.

Several inline functions in ip_set.h are only called in ip_set_core.c:
move them and remove inline function specifier.

Signed-off-by: Jeremy Sowden <jeremy@azazel.net>
Acked-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h | 102 ---------------------------------
 net/netfilter/ipset/ip_set_core.c      | 102 +++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 102 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 985c9bb1ab65..44f6de8a1733 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -508,86 +508,9 @@ ip_set_timeout_set(unsigned long *timeout, u32 value)
 	*timeout = t;
 }
 
-static inline u32
-ip_set_timeout_get(const unsigned long *timeout)
-{
-	u32 t;
-
-	if (*timeout == IPSET_ELEM_PERMANENT)
-		return 0;
-
-	t = jiffies_to_msecs(*timeout - jiffies)/MSEC_PER_SEC;
-	/* Zero value in userspace means no timeout */
-	return t == 0 ? 1 : t;
-}
-
 void ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment,
 			 const struct ip_set_ext *ext);
 
-static inline void
-ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter)
-{
-	atomic64_add((long long)bytes, &(counter)->bytes);
-}
-
-static inline void
-ip_set_add_packets(u64 packets, struct ip_set_counter *counter)
-{
-	atomic64_add((long long)packets, &(counter)->packets);
-}
-
-static inline u64
-ip_set_get_bytes(const struct ip_set_counter *counter)
-{
-	return (u64)atomic64_read(&(counter)->bytes);
-}
-
-static inline u64
-ip_set_get_packets(const struct ip_set_counter *counter)
-{
-	return (u64)atomic64_read(&(counter)->packets);
-}
-
-static inline bool
-ip_set_match_counter(u64 counter, u64 match, u8 op)
-{
-	switch (op) {
-	case IPSET_COUNTER_NONE:
-		return true;
-	case IPSET_COUNTER_EQ:
-		return counter == match;
-	case IPSET_COUNTER_NE:
-		return counter != match;
-	case IPSET_COUNTER_LT:
-		return counter < match;
-	case IPSET_COUNTER_GT:
-		return counter > match;
-	}
-	return false;
-}
-
-static inline void
-ip_set_update_counter(struct ip_set_counter *counter,
-		      const struct ip_set_ext *ext, u32 flags)
-{
-	if (ext->packets != ULLONG_MAX &&
-	    !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) {
-		ip_set_add_bytes(ext->bytes, counter);
-		ip_set_add_packets(ext->packets, counter);
-	}
-}
-
-static inline bool
-ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
-{
-	return nla_put_net64(skb, IPSET_ATTR_BYTES,
-			     cpu_to_be64(ip_set_get_bytes(counter)),
-			     IPSET_ATTR_PAD) ||
-	       nla_put_net64(skb, IPSET_ATTR_PACKETS,
-			     cpu_to_be64(ip_set_get_packets(counter)),
-			     IPSET_ATTR_PAD);
-}
-
 static inline void
 ip_set_init_counter(struct ip_set_counter *counter,
 		    const struct ip_set_ext *ext)
@@ -598,31 +521,6 @@ ip_set_init_counter(struct ip_set_counter *counter,
 		atomic64_set(&(counter)->packets, (long long)(ext->packets));
 }
 
-static inline void
-ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
-		   const struct ip_set_ext *ext,
-		   struct ip_set_ext *mext, u32 flags)
-{
-	mext->skbinfo = *skbinfo;
-}
-
-static inline bool
-ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo)
-{
-	/* Send nonzero parameters only */
-	return ((skbinfo->skbmark || skbinfo->skbmarkmask) &&
-		nla_put_net64(skb, IPSET_ATTR_SKBMARK,
-			      cpu_to_be64((u64)skbinfo->skbmark << 32 |
-					  skbinfo->skbmarkmask),
-			      IPSET_ATTR_PAD)) ||
-	       (skbinfo->skbprio &&
-		nla_put_net32(skb, IPSET_ATTR_SKBPRIO,
-			      cpu_to_be32(skbinfo->skbprio))) ||
-	       (skbinfo->skbqueue &&
-		nla_put_net16(skb, IPSET_ATTR_SKBQUEUE,
-			      cpu_to_be16(skbinfo->skbqueue)));
-}
-
 static inline void
 ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
 		    const struct ip_set_ext *ext)
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 73daea6d4bd5..30bc7df2f4cf 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -325,6 +325,19 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
 }
 EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
 
+static u32
+ip_set_timeout_get(const unsigned long *timeout)
+{
+	u32 t;
+
+	if (*timeout == IPSET_ELEM_PERMANENT)
+		return 0;
+
+	t = jiffies_to_msecs(*timeout - jiffies) / MSEC_PER_SEC;
+	/* Zero value in userspace means no timeout */
+	return t == 0 ? 1 : t;
+}
+
 static char *
 ip_set_comment_uget(struct nlattr *tb)
 {
@@ -510,6 +523,46 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
 }
 EXPORT_SYMBOL_GPL(ip_set_get_extensions);
 
+static u64
+ip_set_get_bytes(const struct ip_set_counter *counter)
+{
+	return (u64)atomic64_read(&(counter)->bytes);
+}
+
+static u64
+ip_set_get_packets(const struct ip_set_counter *counter)
+{
+	return (u64)atomic64_read(&(counter)->packets);
+}
+
+static bool
+ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
+{
+	return nla_put_net64(skb, IPSET_ATTR_BYTES,
+			     cpu_to_be64(ip_set_get_bytes(counter)),
+			     IPSET_ATTR_PAD) ||
+	       nla_put_net64(skb, IPSET_ATTR_PACKETS,
+			     cpu_to_be64(ip_set_get_packets(counter)),
+			     IPSET_ATTR_PAD);
+}
+
+static bool
+ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo)
+{
+	/* Send nonzero parameters only */
+	return ((skbinfo->skbmark || skbinfo->skbmarkmask) &&
+		nla_put_net64(skb, IPSET_ATTR_SKBMARK,
+			      cpu_to_be64((u64)skbinfo->skbmark << 32 |
+					  skbinfo->skbmarkmask),
+			      IPSET_ATTR_PAD)) ||
+	       (skbinfo->skbprio &&
+		nla_put_net32(skb, IPSET_ATTR_SKBPRIO,
+			      cpu_to_be32(skbinfo->skbprio))) ||
+	       (skbinfo->skbqueue &&
+		nla_put_net16(skb, IPSET_ATTR_SKBQUEUE,
+			      cpu_to_be16(skbinfo->skbqueue)));
+}
+
 int
 ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
 		      const void *e, bool active)
@@ -535,6 +588,55 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
 }
 EXPORT_SYMBOL_GPL(ip_set_put_extensions);
 
+static bool
+ip_set_match_counter(u64 counter, u64 match, u8 op)
+{
+	switch (op) {
+	case IPSET_COUNTER_NONE:
+		return true;
+	case IPSET_COUNTER_EQ:
+		return counter == match;
+	case IPSET_COUNTER_NE:
+		return counter != match;
+	case IPSET_COUNTER_LT:
+		return counter < match;
+	case IPSET_COUNTER_GT:
+		return counter > match;
+	}
+	return false;
+}
+
+static void
+ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter)
+{
+	atomic64_add((long long)bytes, &(counter)->bytes);
+}
+
+static void
+ip_set_add_packets(u64 packets, struct ip_set_counter *counter)
+{
+	atomic64_add((long long)packets, &(counter)->packets);
+}
+
+static void
+ip_set_update_counter(struct ip_set_counter *counter,
+		      const struct ip_set_ext *ext, u32 flags)
+{
+	if (ext->packets != ULLONG_MAX &&
+	    !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) {
+		ip_set_add_bytes(ext->bytes, counter);
+		ip_set_add_packets(ext->packets, counter);
+	}
+}
+
+static void
+ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
+		   const struct ip_set_ext *ext,
+		   struct ip_set_ext *mext, u32 flags)
+{
+	mext->skbinfo = *skbinfo;
+}
+
 bool
 ip_set_match_extensions(struct ip_set *set, const struct ip_set_ext *ext,
 			struct ip_set_ext *mext, u32 flags, void *data)
-- 
cgit v1.2.3-59-g8ed1b


From 856391854ce73015fbe2b235f5886205aab166b0 Mon Sep 17 00:00:00 2001
From: Jeremy Sowden <jeremy@azazel.net>
Date: Thu, 3 Oct 2019 20:56:05 +0100
Subject: netfilter: ipset: make ip_set_put_flags extern.

ip_set_put_flags is rather large for a static inline function in a
header-file.  Move it to ip_set_core.c and export it.

Signed-off-by: Jeremy Sowden <jeremy@azazel.net>
Acked-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h | 23 +----------------------
 net/netfilter/ipset/ip_set_core.c      | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 44f6de8a1733..4d8b1eaf7708 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -276,28 +276,7 @@ ip_set_ext_destroy(struct ip_set *set, void *data)
 	}
 }
 
-static inline int
-ip_set_put_flags(struct sk_buff *skb, struct ip_set *set)
-{
-	u32 cadt_flags = 0;
-
-	if (SET_WITH_TIMEOUT(set))
-		if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
-					   htonl(set->timeout))))
-			return -EMSGSIZE;
-	if (SET_WITH_COUNTER(set))
-		cadt_flags |= IPSET_FLAG_WITH_COUNTERS;
-	if (SET_WITH_COMMENT(set))
-		cadt_flags |= IPSET_FLAG_WITH_COMMENT;
-	if (SET_WITH_SKBINFO(set))
-		cadt_flags |= IPSET_FLAG_WITH_SKBINFO;
-	if (SET_WITH_FORCEADD(set))
-		cadt_flags |= IPSET_FLAG_WITH_FORCEADD;
-
-	if (!cadt_flags)
-		return 0;
-	return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags));
-}
+int ip_set_put_flags(struct sk_buff *skb, struct ip_set *set);
 
 /* Netlink CB args */
 enum {
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 30bc7df2f4cf..35cf59e4004b 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1418,6 +1418,30 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 #define DUMP_TYPE(arg)		(((u32)(arg)) & 0x0000FFFF)
 #define DUMP_FLAGS(arg)		(((u32)(arg)) >> 16)
 
+int
+ip_set_put_flags(struct sk_buff *skb, struct ip_set *set)
+{
+	u32 cadt_flags = 0;
+
+	if (SET_WITH_TIMEOUT(set))
+		if (unlikely(nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
+					   htonl(set->timeout))))
+			return -EMSGSIZE;
+	if (SET_WITH_COUNTER(set))
+		cadt_flags |= IPSET_FLAG_WITH_COUNTERS;
+	if (SET_WITH_COMMENT(set))
+		cadt_flags |= IPSET_FLAG_WITH_COMMENT;
+	if (SET_WITH_SKBINFO(set))
+		cadt_flags |= IPSET_FLAG_WITH_SKBINFO;
+	if (SET_WITH_FORCEADD(set))
+		cadt_flags |= IPSET_FLAG_WITH_FORCEADD;
+
+	if (!cadt_flags)
+		return 0;
+	return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags));
+}
+EXPORT_SYMBOL_GPL(ip_set_put_flags);
+
 static int
 ip_set_dump_done(struct netlink_callback *cb)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 3fbd6c4513b5c27465a1dcf2e4286e6c3183bb1f Mon Sep 17 00:00:00 2001
From: Jeremy Sowden <jeremy@azazel.net>
Date: Thu, 3 Oct 2019 20:56:06 +0100
Subject: netfilter: ipset: move function to ip_set_bitmap_ip.c.

One inline function in ip_set_bitmap.h is only called in
ip_set_bitmap_ip.c: move it and remove inline function specifier.

Signed-off-by: Jeremy Sowden <jeremy@azazel.net>
Acked-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set_bitmap.h | 14 --------------
 net/netfilter/ipset/ip_set_bitmap_ip.c        | 12 ++++++++++++
 2 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set_bitmap.h b/include/linux/netfilter/ipset/ip_set_bitmap.h
index 2dddbc6dcac7..fcc4d214a788 100644
--- a/include/linux/netfilter/ipset/ip_set_bitmap.h
+++ b/include/linux/netfilter/ipset/ip_set_bitmap.h
@@ -12,18 +12,4 @@ enum {
 	IPSET_ADD_START_STORED_TIMEOUT,
 };
 
-/* Common functions */
-
-static inline u32
-range_to_mask(u32 from, u32 to, u8 *bits)
-{
-	u32 mask = 0xFFFFFFFE;
-
-	*bits = 32;
-	while (--(*bits) > 0 && mask && (to & mask) != from)
-		mask <<= 1;
-
-	return mask;
-}
-
 #endif /* __IP_SET_BITMAP_H */
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index c06172d5b017..abe8f77d7d23 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -237,6 +237,18 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,
 	return true;
 }
 
+static u32
+range_to_mask(u32 from, u32 to, u8 *bits)
+{
+	u32 mask = 0xFFFFFFFE;
+
+	*bits = 32;
+	while (--(*bits) > 0 && mask && (to & mask) != from)
+		mask <<= 1;
+
+	return mask;
+}
+
 static int
 bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
 		 u32 flags)
-- 
cgit v1.2.3-59-g8ed1b


From f8615bf8a3dabd84bf844c6f888929495039d389 Mon Sep 17 00:00:00 2001
From: Jeremy Sowden <jeremy@azazel.net>
Date: Thu, 3 Oct 2019 20:56:07 +0100
Subject: netfilter: ipset: move ip_set_get_ip_port() to ip_set_bitmap_port.c.

ip_set_get_ip_port() is only used in ip_set_bitmap_port.c.  Move it
there and make it static.

Signed-off-by: Jeremy Sowden <jeremy@azazel.net>
Acked-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set_getport.h |  3 ---
 net/netfilter/ipset/ip_set_bitmap_port.c       | 27 +++++++++++++++++++++++++
 net/netfilter/ipset/ip_set_getport.c           | 28 --------------------------
 3 files changed, 27 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set_getport.h b/include/linux/netfilter/ipset/ip_set_getport.h
index d74cd112b88a..1ecaabd9a048 100644
--- a/include/linux/netfilter/ipset/ip_set_getport.h
+++ b/include/linux/netfilter/ipset/ip_set_getport.h
@@ -20,9 +20,6 @@ static inline bool ip_set_get_ip6_port(const struct sk_buff *skb, bool src,
 }
 #endif
 
-extern bool ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src,
-				__be16 *port);
-
 static inline bool ip_set_proto_with_ports(u8 proto)
 {
 	switch (proto) {
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index 72fede25469d..23d6095cb196 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -96,6 +96,33 @@ bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map)
 	       nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port));
 }
 
+static bool
+ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port)
+{
+	bool ret;
+	u8 proto;
+
+	switch (pf) {
+	case NFPROTO_IPV4:
+		ret = ip_set_get_ip4_port(skb, src, port, &proto);
+		break;
+	case NFPROTO_IPV6:
+		ret = ip_set_get_ip6_port(skb, src, port, &proto);
+		break;
+	default:
+		return false;
+	}
+	if (!ret)
+		return ret;
+	switch (proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int
 bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
 		 const struct xt_action_param *par,
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
index 2b8f959574b4..36615eb3eae1 100644
--- a/net/netfilter/ipset/ip_set_getport.c
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -148,31 +148,3 @@ ip_set_get_ip6_port(const struct sk_buff *skb, bool src,
 }
 EXPORT_SYMBOL_GPL(ip_set_get_ip6_port);
 #endif
-
-bool
-ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port)
-{
-	bool ret;
-	u8 proto;
-
-	switch (pf) {
-	case NFPROTO_IPV4:
-		ret = ip_set_get_ip4_port(skb, src, port, &proto);
-		break;
-	case NFPROTO_IPV6:
-		ret = ip_set_get_ip6_port(skb, src, port, &proto);
-		break;
-	default:
-		return false;
-	}
-	if (!ret)
-		return ret;
-	switch (proto) {
-	case IPPROTO_TCP:
-	case IPPROTO_UDP:
-		return true;
-	default:
-		return false;
-	}
-}
-EXPORT_SYMBOL_GPL(ip_set_get_ip_port);
-- 
cgit v1.2.3-59-g8ed1b


From 7d47433cf74f942a414171867d89c08640cfef45 Mon Sep 17 00:00:00 2001
From: Yamin Friedman <yaminf@mellanox.com>
Date: Mon, 7 Oct 2019 16:59:31 +0300
Subject: net/mlx5: Expose optimal performance scatter entries capability

Expose maximum scatter entries per RDMA READ for optimal performance.

Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 138c50d5a353..c0bfb1d90dd2 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1153,7 +1153,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         log_max_srq[0x5];
 	u8         reserved_at_b0[0x10];
 
-	u8         reserved_at_c0[0x8];
+	u8         max_sgl_for_optimized_performance[0x8];
 	u8         log_max_cq_sz[0x8];
 	u8         reserved_at_d0[0xb];
 	u8         log_max_cq[0x5];
-- 
cgit v1.2.3-59-g8ed1b


From 5d5a0815f854a5b0e21d97e16cfadad69ce5fb04 Mon Sep 17 00:00:00 2001
From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Fri, 27 Sep 2019 12:54:50 +0800
Subject: ipvs: batch __ip_vs_cleanup

It's better to batch __ip_vs_cleanup to speedup ipvs
connections dismantle.

Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |  2 +-
 net/netfilter/ipvs/ip_vs_core.c | 28 ++++++++++++++++------------
 net/netfilter/ipvs/ip_vs_ctl.c  | 12 +++++++++---
 3 files changed, 26 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 3759167f91f5..93e7a252993d 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1324,7 +1324,7 @@ void ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs);
 void ip_vs_control_net_cleanup(struct netns_ipvs *ipvs);
 void ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs);
 void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs);
-void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs);
+void ip_vs_service_nets_cleanup(struct list_head *net_list);
 
 /* IPVS application functions
  * (from ip_vs_app.c)
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 8b80ab794a92..93cfb47823d1 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -2402,18 +2402,22 @@ estimator_fail:
 	return -ENOMEM;
 }
 
-static void __net_exit __ip_vs_cleanup(struct net *net)
+static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list)
 {
-	struct netns_ipvs *ipvs = net_ipvs(net);
-
-	ip_vs_service_net_cleanup(ipvs);	/* ip_vs_flush() with locks */
-	ip_vs_conn_net_cleanup(ipvs);
-	ip_vs_app_net_cleanup(ipvs);
-	ip_vs_protocol_net_cleanup(ipvs);
-	ip_vs_control_net_cleanup(ipvs);
-	ip_vs_estimator_net_cleanup(ipvs);
-	IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen);
-	net->ipvs = NULL;
+	struct netns_ipvs *ipvs;
+	struct net *net;
+
+	ip_vs_service_nets_cleanup(net_list);	/* ip_vs_flush() with locks */
+	list_for_each_entry(net, net_list, exit_list) {
+		ipvs = net_ipvs(net);
+		ip_vs_conn_net_cleanup(ipvs);
+		ip_vs_app_net_cleanup(ipvs);
+		ip_vs_protocol_net_cleanup(ipvs);
+		ip_vs_control_net_cleanup(ipvs);
+		ip_vs_estimator_net_cleanup(ipvs);
+		IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen);
+		net->ipvs = NULL;
+	}
 }
 
 static int __net_init __ip_vs_dev_init(struct net *net)
@@ -2442,7 +2446,7 @@ static void __net_exit __ip_vs_dev_cleanup(struct net *net)
 
 static struct pernet_operations ipvs_core_ops = {
 	.init = __ip_vs_init,
-	.exit = __ip_vs_cleanup,
+	.exit_batch = __ip_vs_cleanup_batch,
 	.id   = &ip_vs_net_id,
 	.size = sizeof(struct netns_ipvs),
 };
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 8b48e7ce1c2c..153c77b5c4f5 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1607,14 +1607,20 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
 
 /*
  *	Delete service by {netns} in the service table.
- *	Called by __ip_vs_cleanup()
+ *	Called by __ip_vs_batch_cleanup()
  */
-void ip_vs_service_net_cleanup(struct netns_ipvs *ipvs)
+void ip_vs_service_nets_cleanup(struct list_head *net_list)
 {
+	struct netns_ipvs *ipvs;
+	struct net *net;
+
 	EnterFunction(2);
 	/* Check for "full" addressed entries */
 	mutex_lock(&__ip_vs_mutex);
-	ip_vs_flush(ipvs, true);
+	list_for_each_entry(net, net_list, exit_list) {
+		ipvs = net_ipvs(net);
+		ip_vs_flush(ipvs, true);
+	}
 	mutex_unlock(&__ip_vs_mutex);
 	LeaveFunction(2);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 79591b7db21d255db158afaa48c557dcab631a1c Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Thu, 5 Sep 2019 04:01:12 +0300
Subject: spi: Add a PTP system timestamp to the transfer structure

SPI is one of the interfaces used to access devices which have a POSIX
clock driver (real time clocks, 1588 timers etc). The fact that the SPI
bus is slow is not what the main problem is, but rather the fact that
drivers don't take a constant amount of time in transferring data over
SPI. When there is a high delay in the readout of time, there will be
uncertainty in the value that has been read out of the peripheral.
When that delay is constant, the uncertainty can at least be
approximated with a certain accuracy which is fine more often than not.

Timing jitter occurs all over in the kernel code, and is mainly caused
by having to let go of the CPU for various reasons such as preemption,
servicing interrupts, going to sleep, etc. Another major reason is CPU
dynamic frequency scaling.

It turns out that the problem of retrieving time from a SPI peripheral
with high accuracy can be solved by the use of "PTP system
timestamping" - a mechanism to correlate the time when the device has
snapshotted its internal time counter with the Linux system time at that
same moment. This is sufficient for having a precise time measurement -
it is not necessary for the whole SPI transfer to be transmitted "as
fast as possible", or "as low-jitter as possible". The system has to be
low-jitter for a very short amount of time to be effective.

This patch introduces a PTP system timestamping mechanism in struct
spi_transfer. This is to be used by SPI device drivers when they need to
know the exact time at which the underlying device's time was
snapshotted. More often than not, SPI peripherals have a very exact
timing for when their SPI-to-interconnect bridge issues a transaction
for snapshotting and reading the time register, and that will be
dependent on when the SPI-to-interconnect bridge figures out that this
is what it should do, aka as soon as it sees byte N of the SPI transfer.
Since spi_device drivers are the ones who'd know best how the peripheral
behaves in this regard, expose a mechanism in spi_transfer which allows
them to specify which word (or word range) from the transfer should be
timestamped.

Add a default implementation of the PTP system timestamping in the SPI
core. This is not going to be satisfactory performance-wise, but should
at least increase the likelihood that SPI device drivers will use PTP
system timestamping in the future.
There are 3 entry points from the core towards the SPI controller
drivers:

- transfer_one: The driver is passed individual spi_transfers to
  execute. This is the easiest to timestamp.

- transfer_one_message: The core passes the driver an entire spi_message
  (a potential batch of spi_transfers). The core puts the same pre and
  post timestamp to all transfers within a message. This is not ideal,
  but nothing better can be done by default anyway, since the core has
  no insight into how the driver batches the transfers.

- transfer: Like transfer_one_message, but for unqueued drivers (i.e.
  the driver implements its own queue scheduling).

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Link: https://lore.kernel.org/r/20190905010114.26718-3-olteanv@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi.c       | 127 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/spi/spi.h |  61 +++++++++++++++++++++++
 2 files changed, 188 insertions(+)

(limited to 'include')

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index f9502dbbb5c1..9bb36c32cbf9 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1171,6 +1171,11 @@ static int spi_transfer_one_message(struct spi_controller *ctlr,
 		spi_statistics_add_transfer_stats(statm, xfer, ctlr);
 		spi_statistics_add_transfer_stats(stats, xfer, ctlr);
 
+		if (!ctlr->ptp_sts_supported) {
+			xfer->ptp_sts_word_pre = 0;
+			ptp_read_system_prets(xfer->ptp_sts);
+		}
+
 		if (xfer->tx_buf || xfer->rx_buf) {
 			reinit_completion(&ctlr->xfer_completion);
 
@@ -1197,6 +1202,11 @@ static int spi_transfer_one_message(struct spi_controller *ctlr,
 					xfer->len);
 		}
 
+		if (!ctlr->ptp_sts_supported) {
+			ptp_read_system_postts(xfer->ptp_sts);
+			xfer->ptp_sts_word_post = xfer->len;
+		}
+
 		trace_spi_transfer_stop(msg, xfer);
 
 		if (msg->status != -EINPROGRESS)
@@ -1265,6 +1275,7 @@ EXPORT_SYMBOL_GPL(spi_finalize_current_transfer);
  */
 static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
 {
+	struct spi_transfer *xfer;
 	struct spi_message *msg;
 	bool was_busy = false;
 	unsigned long flags;
@@ -1391,6 +1402,13 @@ static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
 		goto out;
 	}
 
+	if (!ctlr->ptp_sts_supported && !ctlr->transfer_one) {
+		list_for_each_entry(xfer, &msg->transfers, transfer_list) {
+			xfer->ptp_sts_word_pre = 0;
+			ptp_read_system_prets(xfer->ptp_sts);
+		}
+	}
+
 	ret = ctlr->transfer_one_message(ctlr, msg);
 	if (ret) {
 		dev_err(&ctlr->dev,
@@ -1418,6 +1436,99 @@ static void spi_pump_messages(struct kthread_work *work)
 	__spi_pump_messages(ctlr, true);
 }
 
+/**
+ * spi_take_timestamp_pre - helper for drivers to collect the beginning of the
+ *			    TX timestamp for the requested byte from the SPI
+ *			    transfer. The frequency with which this function
+ *			    must be called (once per word, once for the whole
+ *			    transfer, once per batch of words etc) is arbitrary
+ *			    as long as the @tx buffer offset is greater than or
+ *			    equal to the requested byte at the time of the
+ *			    call. The timestamp is only taken once, at the
+ *			    first such call. It is assumed that the driver
+ *			    advances its @tx buffer pointer monotonically.
+ * @ctlr: Pointer to the spi_controller structure of the driver
+ * @xfer: Pointer to the transfer being timestamped
+ * @tx: Pointer to the current word within the xfer->tx_buf that the driver is
+ *	preparing to transmit right now.
+ * @irqs_off: If true, will disable IRQs and preemption for the duration of the
+ *	      transfer, for less jitter in time measurement. Only compatible
+ *	      with PIO drivers. If true, must follow up with
+ *	      spi_take_timestamp_post or otherwise system will crash.
+ *	      WARNING: for fully predictable results, the CPU frequency must
+ *	      also be under control (governor).
+ */
+void spi_take_timestamp_pre(struct spi_controller *ctlr,
+			    struct spi_transfer *xfer,
+			    const void *tx, bool irqs_off)
+{
+	u8 bytes_per_word = DIV_ROUND_UP(xfer->bits_per_word, 8);
+
+	if (!xfer->ptp_sts)
+		return;
+
+	if (xfer->timestamped_pre)
+		return;
+
+	if (tx < (xfer->tx_buf + xfer->ptp_sts_word_pre * bytes_per_word))
+		return;
+
+	/* Capture the resolution of the timestamp */
+	xfer->ptp_sts_word_pre = (tx - xfer->tx_buf) / bytes_per_word;
+
+	xfer->timestamped_pre = true;
+
+	if (irqs_off) {
+		local_irq_save(ctlr->irq_flags);
+		preempt_disable();
+	}
+
+	ptp_read_system_prets(xfer->ptp_sts);
+}
+EXPORT_SYMBOL_GPL(spi_take_timestamp_pre);
+
+/**
+ * spi_take_timestamp_post - helper for drivers to collect the end of the
+ *			     TX timestamp for the requested byte from the SPI
+ *			     transfer. Can be called with an arbitrary
+ *			     frequency: only the first call where @tx exceeds
+ *			     or is equal to the requested word will be
+ *			     timestamped.
+ * @ctlr: Pointer to the spi_controller structure of the driver
+ * @xfer: Pointer to the transfer being timestamped
+ * @tx: Pointer to the current word within the xfer->tx_buf that the driver has
+ *	just transmitted.
+ * @irqs_off: If true, will re-enable IRQs and preemption for the local CPU.
+ */
+void spi_take_timestamp_post(struct spi_controller *ctlr,
+			     struct spi_transfer *xfer,
+			     const void *tx, bool irqs_off)
+{
+	u8 bytes_per_word = DIV_ROUND_UP(xfer->bits_per_word, 8);
+
+	if (!xfer->ptp_sts)
+		return;
+
+	if (xfer->timestamped_post)
+		return;
+
+	if (tx < (xfer->tx_buf + xfer->ptp_sts_word_post * bytes_per_word))
+		return;
+
+	ptp_read_system_postts(xfer->ptp_sts);
+
+	if (irqs_off) {
+		local_irq_restore(ctlr->irq_flags);
+		preempt_enable();
+	}
+
+	/* Capture the resolution of the timestamp */
+	xfer->ptp_sts_word_post = (tx - xfer->tx_buf) / bytes_per_word;
+
+	xfer->timestamped_post = true;
+}
+EXPORT_SYMBOL_GPL(spi_take_timestamp_post);
+
 /**
  * spi_set_thread_rt - set the controller to pump at realtime priority
  * @ctlr: controller to boost priority of
@@ -1503,6 +1614,7 @@ EXPORT_SYMBOL_GPL(spi_get_next_queued_message);
  */
 void spi_finalize_current_message(struct spi_controller *ctlr)
 {
+	struct spi_transfer *xfer;
 	struct spi_message *mesg;
 	unsigned long flags;
 	int ret;
@@ -1511,6 +1623,13 @@ void spi_finalize_current_message(struct spi_controller *ctlr)
 	mesg = ctlr->cur_msg;
 	spin_unlock_irqrestore(&ctlr->queue_lock, flags);
 
+	if (!ctlr->ptp_sts_supported && !ctlr->transfer_one) {
+		list_for_each_entry(xfer, &mesg->transfers, transfer_list) {
+			ptp_read_system_postts(xfer->ptp_sts);
+			xfer->ptp_sts_word_post = xfer->len;
+		}
+	}
+
 	spi_unmap_msg(ctlr, mesg);
 
 	if (ctlr->cur_msg_prepared && ctlr->unprepare_message) {
@@ -3273,6 +3392,7 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message)
 static int __spi_async(struct spi_device *spi, struct spi_message *message)
 {
 	struct spi_controller *ctlr = spi->controller;
+	struct spi_transfer *xfer;
 
 	/*
 	 * Some controllers do not support doing regular SPI transfers. Return
@@ -3288,6 +3408,13 @@ static int __spi_async(struct spi_device *spi, struct spi_message *message)
 
 	trace_spi_message_submit(message);
 
+	if (!ctlr->ptp_sts_supported) {
+		list_for_each_entry(xfer, &message->transfers, transfer_list) {
+			xfer->ptp_sts_word_pre = 0;
+			ptp_read_system_prets(xfer->ptp_sts);
+		}
+	}
+
 	return ctlr->transfer(spi, message);
 }
 
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index af4f265d0f67..27f6b046cf92 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -13,6 +13,7 @@
 #include <linux/completion.h>
 #include <linux/scatterlist.h>
 #include <linux/gpio/consumer.h>
+#include <linux/ptp_clock_kernel.h>
 
 struct dma_chan;
 struct property_entry;
@@ -409,6 +410,12 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv)
  * @fw_translate_cs: If the boot firmware uses different numbering scheme
  *	what Linux expects, this optional hook can be used to translate
  *	between the two.
+ * @ptp_sts_supported: If the driver sets this to true, it must provide a
+ *	time snapshot in @spi_transfer->ptp_sts as close as possible to the
+ *	moment in time when @spi_transfer->ptp_sts_word_pre and
+ *	@spi_transfer->ptp_sts_word_post were transmitted.
+ *	If the driver does not set this, the SPI core takes the snapshot as
+ *	close to the driver hand-over as possible.
  *
  * Each SPI controller can communicate with one or more @spi_device
  * children.  These make a small bus, sharing MOSI, MISO and SCK signals
@@ -604,6 +611,15 @@ struct spi_controller {
 	void			*dummy_tx;
 
 	int (*fw_translate_cs)(struct spi_controller *ctlr, unsigned cs);
+
+	/*
+	 * Driver sets this field to indicate it is able to snapshot SPI
+	 * transfers (needed e.g. for reading the time of POSIX clocks)
+	 */
+	bool			ptp_sts_supported;
+
+	/* Interrupt enable state during PTP system timestamping */
+	unsigned long		irq_flags;
 };
 
 static inline void *spi_controller_get_devdata(struct spi_controller *ctlr)
@@ -644,6 +660,14 @@ extern struct spi_message *spi_get_next_queued_message(struct spi_controller *ct
 extern void spi_finalize_current_message(struct spi_controller *ctlr);
 extern void spi_finalize_current_transfer(struct spi_controller *ctlr);
 
+/* Helper calls for driver to timestamp transfer */
+void spi_take_timestamp_pre(struct spi_controller *ctlr,
+			    struct spi_transfer *xfer,
+			    const void *tx, bool irqs_off);
+void spi_take_timestamp_post(struct spi_controller *ctlr,
+			     struct spi_transfer *xfer,
+			     const void *tx, bool irqs_off);
+
 /* the spi driver core manages memory for the spi_controller classdev */
 extern struct spi_controller *__spi_alloc_controller(struct device *host,
 						unsigned int size, bool slave);
@@ -753,6 +777,35 @@ extern void spi_res_release(struct spi_controller *ctlr,
  * @transfer_list: transfers are sequenced through @spi_message.transfers
  * @tx_sg: Scatterlist for transmit, currently not for client use
  * @rx_sg: Scatterlist for receive, currently not for client use
+ * @ptp_sts_word_pre: The word (subject to bits_per_word semantics) offset
+ *	within @tx_buf for which the SPI device is requesting that the time
+ *	snapshot for this transfer begins. Upon completing the SPI transfer,
+ *	this value may have changed compared to what was requested, depending
+ *	on the available snapshotting resolution (DMA transfer,
+ *	@ptp_sts_supported is false, etc).
+ * @ptp_sts_word_post: See @ptp_sts_word_post. The two can be equal (meaning
+ *	that a single byte should be snapshotted).
+ *	If the core takes care of the timestamp (if @ptp_sts_supported is false
+ *	for this controller), it will set @ptp_sts_word_pre to 0, and
+ *	@ptp_sts_word_post to the length of the transfer. This is done
+ *	purposefully (instead of setting to spi_transfer->len - 1) to denote
+ *	that a transfer-level snapshot taken from within the driver may still
+ *	be of higher quality.
+ * @ptp_sts: Pointer to a memory location held by the SPI slave device where a
+ *	PTP system timestamp structure may lie. If drivers use PIO or their
+ *	hardware has some sort of assist for retrieving exact transfer timing,
+ *	they can (and should) assert @ptp_sts_supported and populate this
+ *	structure using the ptp_read_system_*ts helper functions.
+ *	The timestamp must represent the time at which the SPI slave device has
+ *	processed the word, i.e. the "pre" timestamp should be taken before
+ *	transmitting the "pre" word, and the "post" timestamp after receiving
+ *	transmit confirmation from the controller for the "post" word.
+ * @timestamped_pre: Set by the SPI controller driver to denote it has acted
+ *	upon the @ptp_sts request. Not set when the SPI core has taken care of
+ *	the task. SPI device drivers are free to print a warning if this comes
+ *	back unset and they need the better resolution.
+ * @timestamped_post: See above. The reason why both exist is that these
+ *	booleans are also used to keep state in the core SPI logic.
  *
  * SPI transfers always write the same number of bytes as they read.
  * Protocol drivers should always provide @rx_buf and/or @tx_buf.
@@ -842,6 +895,14 @@ struct spi_transfer {
 
 	u32		effective_speed_hz;
 
+	unsigned int	ptp_sts_word_pre;
+	unsigned int	ptp_sts_word_post;
+
+	struct ptp_system_timestamp *ptp_sts;
+
+	bool		timestamped_pre;
+	bool		timestamped_post;
+
 	struct list_head transfer_list;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From bacb7e1855969bba78b32302453d2cc8ba0bc403 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 8 Oct 2019 14:20:34 -0700
Subject: Revert "tun: call dev_get_valid_name() before register_netdevice()"

This reverts commit 0ad646c81b2182f7fa67ec0c8c825e0ee165696d.

As noticed by Jakub, this is no longer needed after
commit 11fc7d5a0a2d ("tun: fix memory leak in error path")

This no longer exports dev_get_valid_name() for the exclusive
use of tun driver.

Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/tun.c         | 3 ---
 include/linux/netdevice.h | 3 ---
 net/core/dev.c            | 5 ++---
 3 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 1e541b08b136..0413d182d782 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2788,9 +2788,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 		if (!dev)
 			return -ENOMEM;
-		err = dev_get_valid_name(net, dev, name);
-		if (err < 0)
-			goto err_free_dev;
 
 		dev_net_set(dev, net);
 		dev->rtnl_link_ops = &tun_link_ops;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fe45b2c72315..3207e0b9ec4e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4113,9 +4113,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 				    unsigned char name_assign_type,
 				    void (*setup)(struct net_device *),
 				    unsigned int txqs, unsigned int rxqs);
-int dev_get_valid_name(struct net *net, struct net_device *dev,
-		       const char *name);
-
 #define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \
 	alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 7d05e042c6ba..8bc3dce71fc0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1249,8 +1249,8 @@ int dev_alloc_name(struct net_device *dev, const char *name)
 }
 EXPORT_SYMBOL(dev_alloc_name);
 
-int dev_get_valid_name(struct net *net, struct net_device *dev,
-		       const char *name)
+static int dev_get_valid_name(struct net *net, struct net_device *dev,
+			      const char *name)
 {
 	BUG_ON(!net);
 
@@ -1266,7 +1266,6 @@ int dev_get_valid_name(struct net *net, struct net_device *dev,
 
 	return 0;
 }
-EXPORT_SYMBOL(dev_get_valid_name);
 
 /**
  *	dev_change_name - change name of a device
-- 
cgit v1.2.3-59-g8ed1b


From fd1ac07f3f17fbbc2f08e3b43951bed937d86a7b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 4 Oct 2019 00:21:57 +0300
Subject: xfrm: ifdef setsockopt(UDP_ENCAP_ESPINUDP/UDP_ENCAP_ESPINUDP_NON_IKE)

If IPsec is not configured, there is no reason to delay the inevitable.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 7 -------
 net/ipv4/udp.c     | 2 ++
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index aa08a7a5f6ac..dda3c025452e 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1613,13 +1613,6 @@ static inline int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optv
 {
  	return -ENOPROTOOPT;
 }
-
-static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
-{
- 	/* should not happen */
- 	kfree_skb(skb);
-	return 0;
-}
 #endif
 
 struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cf755156a684..f1c514cb4e87 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2520,9 +2520,11 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 	case UDP_ENCAP:
 		switch (val) {
 		case 0:
+#ifdef CONFIG_XFRM
 		case UDP_ENCAP_ESPINUDP:
 		case UDP_ENCAP_ESPINUDP_NON_IKE:
 			up->encap_rcv = xfrm4_udp_encap_rcv;
+#endif
 			/* FALLTHROUGH */
 		case UDP_ENCAP_L2TPINUDP:
 			up->encap_type = val;
-- 
cgit v1.2.3-59-g8ed1b


From 4b7740324ed86aa4b02cef134da4b79078294d72 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 8 Oct 2019 19:27:33 +0800
Subject: sctp: add SCTP_ADDR_ADDED event

A helper sctp_ulpevent_nofity_peer_addr_change() will be extracted
to make peer_addr_change event and enqueue it, and the helper will
be called in sctp_assoc_add_peer() to send SCTP_ADDR_ADDED event.

This event is described in rfc6458#section-6.1.2:

  SCTP_ADDR_ADDED:  The address is now part of the association.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/sctp/ulpevent.h |  9 ++-------
 net/sctp/associola.c        | 19 ++++++-------------
 net/sctp/ulpevent.c         | 18 +++++++++++++++++-
 3 files changed, 25 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index e1a92c4610f3..e6ead1ed74dd 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -80,13 +80,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_assoc_change(
 	struct sctp_chunk *chunk,
 	gfp_t gfp);
 
-struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
-	const struct sctp_association *asoc,
-	const struct sockaddr_storage *aaddr,
-	int flags,
-	int state,
-	int error,
-	gfp_t gfp);
+void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport,
+					   int state, int error);
 
 struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
 	const struct sctp_association *asoc,
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index d2ffc9a0ba3a..55aad70bb2d3 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -707,6 +707,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list);
 	asoc->peer.transport_count++;
 
+	sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_ADDED, 0);
+
 	/* If we do not yet have a primary path, set one.  */
 	if (!asoc->peer.primary_path) {
 		sctp_assoc_set_primary(asoc, peer);
@@ -781,10 +783,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 				  enum sctp_transport_cmd command,
 				  sctp_sn_error_t error)
 {
-	struct sctp_ulpevent *event;
-	struct sockaddr_storage addr;
-	int spc_state = 0;
 	bool ulp_notify = true;
+	int spc_state = 0;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -836,16 +836,9 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Generate and send a SCTP_PEER_ADDR_CHANGE notification
 	 * to the user.
 	 */
-	if (ulp_notify) {
-		memset(&addr, 0, sizeof(struct sockaddr_storage));
-		memcpy(&addr, &transport->ipaddr,
-		       transport->af_specific->sockaddr_len);
-
-		event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-					0, spc_state, error, GFP_ATOMIC);
-		if (event)
-			asoc->stream.si->enqueue_event(&asoc->ulpq, event);
-	}
+	if (ulp_notify)
+		sctp_ulpevent_nofity_peer_addr_change(transport,
+						      spc_state, error);
 
 	/* Select new active and retran paths. */
 	sctp_select_active_and_retran_path(asoc);
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index e0cc1edf49a0..f07b986ed63e 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -238,7 +238,7 @@ fail:
  * When a destination address on a multi-homed peer encounters a change
  * an interface details event is sent.
  */
-struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
+static struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
 	const struct sctp_association *asoc,
 	const struct sockaddr_storage *aaddr,
 	int flags, int state, int error, gfp_t gfp)
@@ -336,6 +336,22 @@ fail:
 	return NULL;
 }
 
+void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport,
+					   int state, int error)
+{
+	struct sctp_association *asoc = transport->asoc;
+	struct sockaddr_storage addr;
+	struct sctp_ulpevent *event;
+
+	memset(&addr, 0, sizeof(struct sockaddr_storage));
+	memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
+
+	event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, 0, state,
+						    error, GFP_ATOMIC);
+	if (event)
+		asoc->stream.si->enqueue_event(&asoc->ulpq, event);
+}
+
 /* Create and initialize an SCTP_REMOTE_ERROR notification.
  *
  * Note: This assumes that the chunk->skb->data already points to the
-- 
cgit v1.2.3-59-g8ed1b


From b6e6b5f1da7e8d092f86a4351802c27c0170c5a5 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 8 Oct 2019 19:27:36 +0800
Subject: sctp: add SCTP_SEND_FAILED_EVENT event

This patch is to add a new event SCTP_SEND_FAILED_EVENT described in
rfc6458#section-6.1.11. It's a update of SCTP_SEND_FAILED event:

  struct sctp_sndrcvinfo ssf_info is replaced with
  struct sctp_sndinfo ssfe_info in struct sctp_send_failed_event.

SCTP_SEND_FAILED is being deprecated, but we don't remove it in this
patch. Both are being processed in sctp_datamsg_destroy() when the
corresp event flag is set.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/sctp/ulpevent.h |  7 +++++++
 include/uapi/linux/sctp.h   | 16 +++++++++++++++-
 net/sctp/chunk.c            | 40 +++++++++++++++++++---------------------
 net/sctp/ulpevent.c         | 39 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 80 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index e6ead1ed74dd..0b032b92da0b 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -95,6 +95,13 @@ struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
 	__u32 error,
 	gfp_t gfp);
 
+struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event(
+	const struct sctp_association *asoc,
+	struct sctp_chunk *chunk,
+	__u16 flags,
+	__u32 error,
+	gfp_t gfp);
+
 struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
 	const struct sctp_association *asoc,
 	__u16 flags,
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 6d5b164af55c..6bce7f9837a9 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -449,6 +449,16 @@ struct sctp_send_failed {
 	__u8 ssf_data[0];
 };
 
+struct sctp_send_failed_event {
+	__u16 ssf_type;
+	__u16 ssf_flags;
+	__u32 ssf_length;
+	__u32 ssf_error;
+	struct sctp_sndinfo ssfe_info;
+	sctp_assoc_t ssf_assoc_id;
+	__u8 ssf_data[0];
+};
+
 /*
  *   ssf_flags: 16 bits (unsigned integer)
  *
@@ -605,6 +615,7 @@ struct sctp_event_subscribe {
 	__u8 sctp_stream_reset_event;
 	__u8 sctp_assoc_reset_event;
 	__u8 sctp_stream_change_event;
+	__u8 sctp_send_failure_event_event;
 };
 
 /*
@@ -632,6 +643,7 @@ union sctp_notification {
 	struct sctp_stream_reset_event sn_strreset_event;
 	struct sctp_assoc_reset_event sn_assocreset_event;
 	struct sctp_stream_change_event sn_strchange_event;
+	struct sctp_send_failed_event sn_send_failed_event;
 };
 
 /* Section 5.3.1
@@ -667,7 +679,9 @@ enum sctp_sn_type {
 #define SCTP_ASSOC_RESET_EVENT		SCTP_ASSOC_RESET_EVENT
 	SCTP_STREAM_CHANGE_EVENT,
 #define SCTP_STREAM_CHANGE_EVENT	SCTP_STREAM_CHANGE_EVENT
-	SCTP_SN_TYPE_MAX	= SCTP_STREAM_CHANGE_EVENT,
+	SCTP_SEND_FAILED_EVENT,
+#define SCTP_SEND_FAILED_EVENT		SCTP_SEND_FAILED_EVENT
+	SCTP_SN_TYPE_MAX	= SCTP_SEND_FAILED_EVENT,
 #define SCTP_SN_TYPE_MAX		SCTP_SN_TYPE_MAX
 };
 
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index cc0405c79dfc..cc3ce5d80b08 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -75,41 +75,39 @@ static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
 	struct list_head *pos, *temp;
 	struct sctp_chunk *chunk;
 	struct sctp_ulpevent *ev;
-	int error = 0, notify;
-
-	/* If we failed, we may need to notify. */
-	notify = msg->send_failed ? -1 : 0;
+	int error, sent;
 
 	/* Release all references. */
 	list_for_each_safe(pos, temp, &msg->chunks) {
 		list_del_init(pos);
 		chunk = list_entry(pos, struct sctp_chunk, frag_list);
-		/* Check whether we _really_ need to notify. */
-		if (notify < 0) {
-			asoc = chunk->asoc;
-			if (msg->send_error)
-				error = msg->send_error;
-			else
-				error = asoc->outqueue.error;
-
-			notify = sctp_ulpevent_type_enabled(asoc->subscribe,
-							    SCTP_SEND_FAILED);
+
+		if (!msg->send_failed) {
+			sctp_chunk_put(chunk);
+			continue;
 		}
 
-		/* Generate a SEND FAILED event only if enabled. */
-		if (notify > 0) {
-			int sent;
-			if (chunk->has_tsn)
-				sent = SCTP_DATA_SENT;
-			else
-				sent = SCTP_DATA_UNSENT;
+		asoc = chunk->asoc;
+		error = msg->send_error ?: asoc->outqueue.error;
+		sent = chunk->has_tsn ? SCTP_DATA_SENT : SCTP_DATA_UNSENT;
 
+		if (sctp_ulpevent_type_enabled(asoc->subscribe,
+					       SCTP_SEND_FAILED)) {
 			ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent,
 							    error, GFP_ATOMIC);
 			if (ev)
 				asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
 		}
 
+		if (sctp_ulpevent_type_enabled(asoc->subscribe,
+					       SCTP_SEND_FAILED_EVENT)) {
+			ev = sctp_ulpevent_make_send_failed_event(asoc, chunk,
+								  sent, error,
+								  GFP_ATOMIC);
+			if (ev)
+				asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
+		}
+
 		sctp_chunk_put(chunk);
 	}
 
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index f07b986ed63e..c82dbdcf13f2 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -527,6 +527,45 @@ fail:
 	return NULL;
 }
 
+struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event(
+	const struct sctp_association *asoc, struct sctp_chunk *chunk,
+	__u16 flags, __u32 error, gfp_t gfp)
+{
+	struct sctp_send_failed_event *ssf;
+	struct sctp_ulpevent *event;
+	struct sk_buff *skb;
+	int len;
+
+	skb = skb_copy_expand(chunk->skb, sizeof(*ssf), 0, gfp);
+	if (!skb)
+		return NULL;
+
+	len = ntohs(chunk->chunk_hdr->length);
+	len -= sctp_datachk_len(&asoc->stream);
+
+	skb_pull(skb, sctp_datachk_len(&asoc->stream));
+	event = sctp_skb2event(skb);
+	sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize);
+
+	ssf = skb_push(skb, sizeof(*ssf));
+	ssf->ssf_type = SCTP_SEND_FAILED_EVENT;
+	ssf->ssf_flags = flags;
+	ssf->ssf_length = sizeof(*ssf) + len;
+	skb_trim(skb, ssf->ssf_length);
+	ssf->ssf_error = error;
+
+	ssf->ssfe_info.snd_sid = chunk->sinfo.sinfo_stream;
+	ssf->ssfe_info.snd_ppid = chunk->sinfo.sinfo_ppid;
+	ssf->ssfe_info.snd_context = chunk->sinfo.sinfo_context;
+	ssf->ssfe_info.snd_assoc_id = chunk->sinfo.sinfo_assoc_id;
+	ssf->ssfe_info.snd_flags = chunk->chunk_hdr->flags;
+
+	sctp_ulpevent_set_owner(event, asoc);
+	ssf->ssf_assoc_id = sctp_assoc2id(asoc);
+
+	return event;
+}
+
 /* Create and initialize a SCTP_SHUTDOWN_EVENT notification.
  *
  * Socket Extensions for SCTP - draft-01
-- 
cgit v1.2.3-59-g8ed1b


From 690a6ca7df3de7b90546bc10a620d1ac8ccaa1a1 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 8 Oct 2019 21:03:14 -0700
Subject: DIM: fix dim.h kernel-doc and headers

Lots of fixes to kernel-doc in structs, enums, and functions.
Also add header files that are being used but not yet #included.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Yamin Friedman <yaminf@mellanox.com>
Cc: Tal Gilboa <talgi@mellanox.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: linux-rdma@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/linux/dim.h | 63 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 33 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/dim.h b/include/linux/dim.h
index 9fa4b3f88c39..b698266d0035 100644
--- a/include/linux/dim.h
+++ b/include/linux/dim.h
@@ -4,22 +4,26 @@
 #ifndef DIM_H
 #define DIM_H
 
+#include <linux/bits.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
 
-/**
+/*
  * Number of events between DIM iterations.
  * Causes a moderation of the algorithm run.
  */
 #define DIM_NEVENTS 64
 
-/**
+/*
  * Is a difference between values justifies taking an action.
  * We consider 10% difference as significant.
  */
 #define IS_SIGNIFICANT_DIFF(val, ref) \
 	(((100UL * abs((val) - (ref))) / (ref)) > 10)
 
-/**
+/*
  * Calculate the gap between two values.
  * Take wrap-around and variable size into consideration.
  */
@@ -27,12 +31,13 @@
 		& (BIT_ULL(bits) - 1))
 
 /**
- * Structure for CQ moderation values.
+ * struct dim_cq_moder - Structure for CQ moderation values.
  * Used for communications between DIM and its consumer.
  *
  * @usec: CQ timer suggestion (by DIM)
  * @pkts: CQ packet counter suggestion (by DIM)
- * @cq_period_mode: CQ priod count mode (from CQE/EQE)
+ * @comps: Completion counter
+ * @cq_period_mode: CQ period count mode (from CQE/EQE)
  */
 struct dim_cq_moder {
 	u16 usec;
@@ -42,13 +47,14 @@ struct dim_cq_moder {
 };
 
 /**
- * Structure for DIM sample data.
+ * struct dim_sample - Structure for DIM sample data.
  * Used for communications between DIM and its consumer.
  *
  * @time: Sample timestamp
  * @pkt_ctr: Number of packets
  * @byte_ctr: Number of bytes
  * @event_ctr: Number of events
+ * @comp_ctr: Current completion counter
  */
 struct dim_sample {
 	ktime_t time;
@@ -59,12 +65,14 @@ struct dim_sample {
 };
 
 /**
- * Structure for DIM stats.
+ * struct dim_stats - Structure for DIM stats.
  * Used for holding current measured rates.
  *
  * @ppms: Packets per msec
  * @bpms: Bytes per msec
  * @epms: Events per msec
+ * @cpms: Completions per msec
+ * @cpe_ratio: Ratio of completions to events
  */
 struct dim_stats {
 	int ppms; /* packets per msec */
@@ -75,12 +83,13 @@ struct dim_stats {
 };
 
 /**
- * Main structure for dynamic interrupt moderation (DIM).
+ * struct dim - Main structure for dynamic interrupt moderation (DIM).
  * Used for holding all information about a specific DIM instance.
  *
  * @state: Algorithm state (see below)
  * @prev_stats: Measured rates from previous iteration (for comparison)
  * @start_sample: Sampled data at start of current iteration
+ * @measuring_sample: A &dim_sample that is used to update the current events
  * @work: Work to perform on action required
  * @priv: A pointer to the struct that points to dim
  * @profile_ix: Current moderation profile
@@ -106,24 +115,21 @@ struct dim {
 };
 
 /**
- * enum dim_cq_period_mode
- *
- * These are the modes for CQ period count.
+ * enum dim_cq_period_mode - Modes for CQ period count
  *
  * @DIM_CQ_PERIOD_MODE_START_FROM_EQE: Start counting from EQE
  * @DIM_CQ_PERIOD_MODE_START_FROM_CQE: Start counting from CQE (implies timer reset)
  * @DIM_CQ_PERIOD_NUM_MODES: Number of modes
  */
-enum {
+enum dim_cq_period_mode {
 	DIM_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
 	DIM_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
 	DIM_CQ_PERIOD_NUM_MODES
 };
 
 /**
- * enum dim_state
+ * enum dim_state - DIM algorithm states
  *
- * These are the DIM algorithm states.
  * These will determine if the algorithm is in a valid state to start an iteration.
  *
  * @DIM_START_MEASURE: This is the first iteration (also after applying a new profile)
@@ -131,16 +137,15 @@ enum {
  * need to perform an action
  * @DIM_APPLY_NEW_PROFILE: DIM consumer is currently applying a profile - no need to measure
  */
-enum {
+enum dim_state {
 	DIM_START_MEASURE,
 	DIM_MEASURE_IN_PROGRESS,
 	DIM_APPLY_NEW_PROFILE,
 };
 
 /**
- * enum dim_tune_state
+ * enum dim_tune_state - DIM algorithm tune states
  *
- * These are the DIM algorithm tune states.
  * These will determine which action the algorithm should perform.
  *
  * @DIM_PARKING_ON_TOP: Algorithm found a local top point - exit on significant difference
@@ -148,7 +153,7 @@ enum {
  * @DIM_GOING_RIGHT: Algorithm is currently trying higher moderation levels
  * @DIM_GOING_LEFT: Algorithm is currently trying lower moderation levels
  */
-enum {
+enum dim_tune_state {
 	DIM_PARKING_ON_TOP,
 	DIM_PARKING_TIRED,
 	DIM_GOING_RIGHT,
@@ -156,25 +161,23 @@ enum {
 };
 
 /**
- * enum dim_stats_state
+ * enum dim_stats_state - DIM algorithm statistics states
  *
- * These are the DIM algorithm statistics states.
  * These will determine the verdict of current iteration.
  *
  * @DIM_STATS_WORSE: Current iteration shows worse performance than before
- * @DIM_STATS_WORSE: Current iteration shows same performance than before
- * @DIM_STATS_WORSE: Current iteration shows better performance than before
+ * @DIM_STATS_SAME:  Current iteration shows same performance than before
+ * @DIM_STATS_BETTER: Current iteration shows better performance than before
  */
-enum {
+enum dim_stats_state {
 	DIM_STATS_WORSE,
 	DIM_STATS_SAME,
 	DIM_STATS_BETTER,
 };
 
 /**
- * enum dim_step_result
+ * enum dim_step_result - DIM algorithm step results
  *
- * These are the DIM algorithm step results.
  * These describe the result of a step.
  *
  * @DIM_STEPPED: Performed a regular step
@@ -182,7 +185,7 @@ enum {
  * tired parking
  * @DIM_ON_EDGE: Stepped to the most left/right profile
  */
-enum {
+enum dim_step_result {
 	DIM_STEPPED,
 	DIM_TOO_TIRED,
 	DIM_ON_EDGE,
@@ -199,7 +202,7 @@ enum {
 bool dim_on_top(struct dim *dim);
 
 /**
- *	dim_turn - change profile alterning direction
+ *	dim_turn - change profile altering direction
  *	@dim: DIM context
  *
  * Go left if we were going right and vice-versa.
@@ -238,7 +241,7 @@ void dim_calc_stats(struct dim_sample *start, struct dim_sample *end,
 		    struct dim_stats *curr_stats);
 
 /**
- *	dim_update_sample - set a sample's fields with give values
+ *	dim_update_sample - set a sample's fields with given values
  *	@event_ctr: number of events to set
  *	@packets: number of packets to set
  *	@bytes: number of bytes to set
@@ -304,8 +307,8 @@ struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode);
  *	@end_sample: Current data measurement
  *
  * Called by the consumer.
- * This is the main logic of the algorithm, where data is processed in order to decide on next
- * required action.
+ * This is the main logic of the algorithm, where data is processed in order
+ * to decide on next required action.
  */
 void net_dim(struct dim *dim, struct dim_sample end_sample);
 
-- 
cgit v1.2.3-59-g8ed1b


From a2351c5d86d7acf8eef17fba4ac1fc5b305a37c0 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Wed, 9 Oct 2019 10:07:43 +0200
Subject: net/smc: separate SMCD and SMCR link group lists

Currently SMCD and SMCR link groups are maintained in one list.
To facilitate abnormal termination handling they are split into
a separate list for SMCR link groups and separate lists for SMCD
link groups per SMCD device.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/smc.h  |  1 +
 net/smc/smc_core.c | 24 +++++++++++++++++-------
 net/smc/smc_ism.c  |  1 +
 3 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/smc.h b/include/net/smc.h
index bd9c0fb3b577..c08e8c415673 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -75,6 +75,7 @@ struct smcd_dev {
 	struct workqueue_struct *event_wq;
 	u8 pnetid[SMC_MAX_PNETID_LEN];
 	bool pnetid_by_user;
+	struct list_head lgr_list;
 };
 
 struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 4ca50ddf8d16..46d679542b87 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -198,6 +198,7 @@ static void smc_lgr_free_work(struct work_struct *work)
 static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
 {
 	struct smc_link_group *lgr;
+	struct list_head *lgr_list;
 	struct smc_link *lnk;
 	u8 rndvec[3];
 	int rc = 0;
@@ -233,6 +234,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
 		/* SMC-D specific settings */
 		lgr->peer_gid = ini->ism_gid;
 		lgr->smcd = ini->ism_dev;
+		lgr_list = &ini->ism_dev->lgr_list;
 	} else {
 		/* SMC-R specific settings */
 		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
@@ -245,6 +247,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
 		lnk->link_id = SMC_SINGLE_LINK;
 		lnk->smcibdev = ini->ib_dev;
 		lnk->ibport = ini->ib_port;
+		lgr_list = &smc_lgr_list.list;
 		lnk->path_mtu =
 			ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
 		if (!ini->ib_dev->initialized)
@@ -275,7 +278,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
 	}
 	smc->conn.lgr = lgr;
 	spin_lock_bh(&smc_lgr_list.lock);
-	list_add(&lgr->list, &smc_lgr_list.list);
+	list_add(&lgr->list, lgr_list);
 	spin_unlock_bh(&smc_lgr_list.lock);
 	return 0;
 
@@ -512,9 +515,8 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
 
 	/* run common cleanup function and build free list */
 	spin_lock_bh(&smc_lgr_list.lock);
-	list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) {
-		if (lgr->is_smcd && lgr->smcd == dev &&
-		    (!peer_gid || lgr->peer_gid == peer_gid) &&
+	list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) {
+		if ((!peer_gid || lgr->peer_gid == peer_gid) &&
 		    (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
 			__smc_lgr_terminate(lgr);
 			list_move(&lgr->list, &lgr_free_list);
@@ -604,10 +606,12 @@ static bool smcd_lgr_match(struct smc_link_group *lgr,
 int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
 {
 	struct smc_connection *conn = &smc->conn;
+	struct list_head *lgr_list;
 	struct smc_link_group *lgr;
 	enum smc_lgr_role role;
 	int rc = 0;
 
+	lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list;
 	ini->cln_first_contact = SMC_FIRST_CONTACT;
 	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
 	if (role == SMC_CLNT && ini->srv_first_contact)
@@ -616,7 +620,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
 
 	/* determine if an existing link group can be reused */
 	spin_lock_bh(&smc_lgr_list.lock);
-	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
+	list_for_each_entry(lgr, lgr_list, list) {
 		write_lock_bh(&lgr->conns_lock);
 		if ((ini->is_smcd ?
 		     smcd_lgr_match(lgr, ini->ism_dev, ini->ism_gid) :
@@ -1029,11 +1033,17 @@ void smc_core_exit(void)
 {
 	struct smc_link_group *lgr, *lg;
 	LIST_HEAD(lgr_freeing_list);
+	struct smcd_dev *smcd;
 
 	spin_lock_bh(&smc_lgr_list.lock);
-	if (!list_empty(&smc_lgr_list.list))
-		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
+	list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
 	spin_unlock_bh(&smc_lgr_list.lock);
+
+	spin_lock(&smcd_dev_list.lock);
+	list_for_each_entry(smcd, &smcd_dev_list.list, list)
+		list_splice_init(&smcd->lgr_list, &lgr_freeing_list);
+	spin_unlock(&smcd_dev_list.lock);
+
 	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
 		list_del_init(&lgr->list);
 		if (!lgr->is_smcd) {
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index e89e918b88e0..674eb5ae2320 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -287,6 +287,7 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
 
 	spin_lock_init(&smcd->lock);
 	INIT_LIST_HEAD(&smcd->vlan);
+	INIT_LIST_HEAD(&smcd->lgr_list);
 	smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)",
 						 WQ_MEM_RECLAIM, name);
 	if (!smcd->event_wq) {
-- 
cgit v1.2.3-59-g8ed1b


From a0a62ee15a829ebf8aeec55a4f1688230439b3e0 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Wed, 9 Oct 2019 10:07:44 +0200
Subject: net/smc: separate locks for SMCD and SMCR link group lists

This patch introduces separate locks for the split SMCD and SMCR
link group lists.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/smc.h  |  1 +
 net/smc/smc_core.c | 57 ++++++++++++++++++++++++++++++++++++++++--------------
 net/smc/smc_ism.c  |  1 +
 3 files changed, 44 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/smc.h b/include/net/smc.h
index c08e8c415673..438bb0261f45 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -76,6 +76,7 @@ struct smcd_dev {
 	u8 pnetid[SMC_MAX_PNETID_LEN];
 	bool pnetid_by_user;
 	struct list_head lgr_list;
+	spinlock_t lgr_lock;
 };
 
 struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 46d679542b87..949b1914e11a 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -42,6 +42,19 @@ static struct smc_lgr_list smc_lgr_list = {	/* established link groups */
 static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
 			 struct smc_buf_desc *buf_desc);
 
+/* return head of link group list and its lock for a given link group */
+static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
+						  spinlock_t **lgr_lock)
+{
+	if (lgr->is_smcd) {
+		*lgr_lock = &lgr->smcd->lgr_lock;
+		return &lgr->smcd->lgr_list;
+	}
+
+	*lgr_lock = &smc_lgr_list.lock;
+	return &smc_lgr_list.list;
+}
+
 static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
 {
 	/* client link group creation always follows the server link group
@@ -157,19 +170,21 @@ static void smc_lgr_free_work(struct work_struct *work)
 	struct smc_link_group *lgr = container_of(to_delayed_work(work),
 						  struct smc_link_group,
 						  free_work);
+	spinlock_t *lgr_lock;
 	bool conns;
 
-	spin_lock_bh(&smc_lgr_list.lock);
+	smc_lgr_list_head(lgr, &lgr_lock);
+	spin_lock_bh(lgr_lock);
 	read_lock_bh(&lgr->conns_lock);
 	conns = RB_EMPTY_ROOT(&lgr->conns_all);
 	read_unlock_bh(&lgr->conns_lock);
 	if (!conns) { /* number of lgr connections is no longer zero */
-		spin_unlock_bh(&smc_lgr_list.lock);
+		spin_unlock_bh(lgr_lock);
 		return;
 	}
 	if (!list_empty(&lgr->list))
 		list_del_init(&lgr->list); /* remove from smc_lgr_list */
-	spin_unlock_bh(&smc_lgr_list.lock);
+	spin_unlock_bh(lgr_lock);
 
 	if (!lgr->is_smcd && !lgr->terminating)	{
 		struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
@@ -200,6 +215,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
 	struct smc_link_group *lgr;
 	struct list_head *lgr_list;
 	struct smc_link *lnk;
+	spinlock_t *lgr_lock;
 	u8 rndvec[3];
 	int rc = 0;
 	int i;
@@ -235,6 +251,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
 		lgr->peer_gid = ini->ism_gid;
 		lgr->smcd = ini->ism_dev;
 		lgr_list = &ini->ism_dev->lgr_list;
+		lgr_lock = &lgr->smcd->lgr_lock;
 	} else {
 		/* SMC-R specific settings */
 		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
@@ -248,6 +265,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
 		lnk->smcibdev = ini->ib_dev;
 		lnk->ibport = ini->ib_port;
 		lgr_list = &smc_lgr_list.list;
+		lgr_lock = &smc_lgr_list.lock;
 		lnk->path_mtu =
 			ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
 		if (!ini->ib_dev->initialized)
@@ -277,9 +295,9 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
 			goto destroy_qp;
 	}
 	smc->conn.lgr = lgr;
-	spin_lock_bh(&smc_lgr_list.lock);
+	spin_lock_bh(lgr_lock);
 	list_add(&lgr->list, lgr_list);
-	spin_unlock_bh(&smc_lgr_list.lock);
+	spin_unlock_bh(lgr_lock);
 	return 0;
 
 destroy_qp:
@@ -442,11 +460,15 @@ static void smc_lgr_free(struct smc_link_group *lgr)
 
 void smc_lgr_forget(struct smc_link_group *lgr)
 {
-	spin_lock_bh(&smc_lgr_list.lock);
+	struct list_head *lgr_list;
+	spinlock_t *lgr_lock;
+
+	lgr_list = smc_lgr_list_head(lgr, &lgr_lock);
+	spin_lock_bh(lgr_lock);
 	/* do not use this link group for new connections */
-	if (!list_empty(&lgr->list))
-		list_del_init(&lgr->list);
-	spin_unlock_bh(&smc_lgr_list.lock);
+	if (!list_empty(lgr_list))
+		list_del_init(lgr_list);
+	spin_unlock_bh(lgr_lock);
 }
 
 /* terminate linkgroup abnormally */
@@ -487,9 +509,12 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr)
 
 void smc_lgr_terminate(struct smc_link_group *lgr)
 {
-	spin_lock_bh(&smc_lgr_list.lock);
+	spinlock_t *lgr_lock;
+
+	smc_lgr_list_head(lgr, &lgr_lock);
+	spin_lock_bh(lgr_lock);
 	__smc_lgr_terminate(lgr);
-	spin_unlock_bh(&smc_lgr_list.lock);
+	spin_unlock_bh(lgr_lock);
 }
 
 /* Called when IB port is terminated */
@@ -514,7 +539,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
 	LIST_HEAD(lgr_free_list);
 
 	/* run common cleanup function and build free list */
-	spin_lock_bh(&smc_lgr_list.lock);
+	spin_lock_bh(&dev->lgr_lock);
 	list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) {
 		if ((!peer_gid || lgr->peer_gid == peer_gid) &&
 		    (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) {
@@ -522,7 +547,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan)
 			list_move(&lgr->list, &lgr_free_list);
 		}
 	}
-	spin_unlock_bh(&smc_lgr_list.lock);
+	spin_unlock_bh(&dev->lgr_lock);
 
 	/* cancel the regular free workers and actually free lgrs */
 	list_for_each_entry_safe(lgr, l, &lgr_free_list, list) {
@@ -609,9 +634,11 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
 	struct list_head *lgr_list;
 	struct smc_link_group *lgr;
 	enum smc_lgr_role role;
+	spinlock_t *lgr_lock;
 	int rc = 0;
 
 	lgr_list = ini->is_smcd ? &ini->ism_dev->lgr_list : &smc_lgr_list.list;
+	lgr_lock = ini->is_smcd ? &ini->ism_dev->lgr_lock : &smc_lgr_list.lock;
 	ini->cln_first_contact = SMC_FIRST_CONTACT;
 	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
 	if (role == SMC_CLNT && ini->srv_first_contact)
@@ -619,7 +646,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
 		goto create;
 
 	/* determine if an existing link group can be reused */
-	spin_lock_bh(&smc_lgr_list.lock);
+	spin_lock_bh(lgr_lock);
 	list_for_each_entry(lgr, lgr_list, list) {
 		write_lock_bh(&lgr->conns_lock);
 		if ((ini->is_smcd ?
@@ -640,7 +667,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
 		}
 		write_unlock_bh(&lgr->conns_lock);
 	}
-	spin_unlock_bh(&smc_lgr_list.lock);
+	spin_unlock_bh(lgr_lock);
 
 	if (role == SMC_CLNT && !ini->srv_first_contact &&
 	    ini->cln_first_contact == SMC_FIRST_CONTACT) {
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 674eb5ae2320..34dc619655e8 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -286,6 +286,7 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
 	smc_pnetid_by_dev_port(parent, 0, smcd->pnetid);
 
 	spin_lock_init(&smcd->lock);
+	spin_lock_init(&smcd->lgr_lock);
 	INIT_LIST_HEAD(&smcd->vlan);
 	INIT_LIST_HEAD(&smcd->lgr_list);
 	smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)",
-- 
cgit v1.2.3-59-g8ed1b


From c3d9494e68c4a5d23227ede822fda9bd68bef8e3 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Wed, 9 Oct 2019 10:07:46 +0200
Subject: net/smc: no new connections on disappearing devices

Add a "going_away" indication to ISM devices and IB ports and
avoid creation of new connections on such disappearing devices.

And do not handle ISM events if ISM device is disappearing.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/smc.h  |  1 +
 net/smc/smc_core.c | 23 +++++++++++++++++++++++
 net/smc/smc_ib.c   | 15 +++++++++++++--
 net/smc/smc_ib.h   |  1 +
 net/smc/smc_ism.c  |  3 +++
 net/smc/smc_pnet.c |  5 ++++-
 6 files changed, 45 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/smc.h b/include/net/smc.h
index 438bb0261f45..05174ae4f325 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -77,6 +77,7 @@ struct smcd_dev {
 	bool pnetid_by_user;
 	struct list_head lgr_list;
 	spinlock_t lgr_lock;
+	u8 going_away : 1;
 };
 
 struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index a07fbf56c929..5862784eedd4 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -1060,6 +1060,27 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn,
 	return 0;
 }
 
+static void smc_core_going_away(void)
+{
+	struct smc_ib_device *smcibdev;
+	struct smcd_dev *smcd;
+
+	spin_lock(&smc_ib_devices.lock);
+	list_for_each_entry(smcibdev, &smc_ib_devices.list, list) {
+		int i;
+
+		for (i = 0; i < SMC_MAX_PORTS; i++)
+			set_bit(i, smcibdev->ports_going_away);
+	}
+	spin_unlock(&smc_ib_devices.lock);
+
+	spin_lock(&smcd_dev_list.lock);
+	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
+		smcd->going_away = 1;
+	}
+	spin_unlock(&smcd_dev_list.lock);
+}
+
 /* Called (from smc_exit) when module is removed */
 void smc_core_exit(void)
 {
@@ -1067,6 +1088,8 @@ void smc_core_exit(void)
 	LIST_HEAD(lgr_freeing_list);
 	struct smcd_dev *smcd;
 
+	smc_core_going_away();
+
 	spin_lock_bh(&smc_lgr_list.lock);
 	list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
 	spin_unlock_bh(&smc_lgr_list.lock);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index d14ca4af6f94..af05daeb0538 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -242,8 +242,12 @@ static void smc_ib_port_event_work(struct work_struct *work)
 	for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
 		smc_ib_remember_port_attr(smcibdev, port_idx + 1);
 		clear_bit(port_idx, &smcibdev->port_event_mask);
-		if (!smc_ib_port_active(smcibdev, port_idx + 1))
+		if (!smc_ib_port_active(smcibdev, port_idx + 1)) {
+			set_bit(port_idx, smcibdev->ports_going_away);
 			smc_port_terminate(smcibdev, port_idx + 1);
+		} else {
+			clear_bit(port_idx, smcibdev->ports_going_away);
+		}
 	}
 }
 
@@ -259,8 +263,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler,
 	switch (ibevent->event) {
 	case IB_EVENT_DEVICE_FATAL:
 		/* terminate all ports on device */
-		for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++)
+		for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
 			set_bit(port_idx, &smcibdev->port_event_mask);
+			set_bit(port_idx, smcibdev->ports_going_away);
+		}
 		schedule_work(&smcibdev->port_event_work);
 		break;
 	case IB_EVENT_PORT_ERR:
@@ -269,6 +275,10 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler,
 		port_idx = ibevent->element.port_num - 1;
 		if (port_idx < SMC_MAX_PORTS) {
 			set_bit(port_idx, &smcibdev->port_event_mask);
+			if (ibevent->event == IB_EVENT_PORT_ERR)
+				set_bit(port_idx, smcibdev->ports_going_away);
+			else if (ibevent->event == IB_EVENT_PORT_ACTIVE)
+				clear_bit(port_idx, smcibdev->ports_going_away);
 			schedule_work(&smcibdev->port_event_work);
 		}
 		break;
@@ -307,6 +317,7 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
 		port_idx = ibevent->element.qp->port - 1;
 		if (port_idx < SMC_MAX_PORTS) {
 			set_bit(port_idx, &smcibdev->port_event_mask);
+			set_bit(port_idx, smcibdev->ports_going_away);
 			schedule_work(&smcibdev->port_event_work);
 		}
 		break;
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index da60ab9e8d70..6a0069db6cae 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -47,6 +47,7 @@ struct smc_ib_device {				/* ib-device infos for smc */
 	u8			initialized : 1; /* ib dev CQ, evthdl done */
 	struct work_struct	port_event_work;
 	unsigned long		port_event_mask;
+	DECLARE_BITMAP(ports_going_away, SMC_MAX_PORTS);
 };
 
 struct smc_buf_desc;
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 34dc619655e8..ee7340898cb4 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -315,6 +315,7 @@ void smcd_unregister_dev(struct smcd_dev *smcd)
 	spin_lock(&smcd_dev_list.lock);
 	list_del(&smcd->list);
 	spin_unlock(&smcd_dev_list.lock);
+	smcd->going_away = 1;
 	flush_workqueue(smcd->event_wq);
 	destroy_workqueue(smcd->event_wq);
 	smc_smcd_terminate(smcd, 0, VLAN_VID_MASK);
@@ -344,6 +345,8 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event)
 {
 	struct smc_ism_event_work *wrk;
 
+	if (smcd->going_away)
+		return;
 	/* copy event to event work queue, and let it be handled there */
 	wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC);
 	if (!wrk)
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index bab2da8cf17a..6b7799b3f5ca 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -781,6 +781,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev,
 			dev_put(ndev);
 			if (netdev == ndev &&
 			    smc_ib_port_active(ibdev, i) &&
+			    !test_bit(i - 1, ibdev->ports_going_away) &&
 			    !smc_ib_determine_gid(ibdev, i, ini->vlan_id,
 						  ini->ib_gid, NULL)) {
 				ini->ib_dev = ibdev;
@@ -820,6 +821,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
 				continue;
 			if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) &&
 			    smc_ib_port_active(ibdev, i) &&
+			    !test_bit(i - 1, ibdev->ports_going_away) &&
 			    !smc_ib_determine_gid(ibdev, i, ini->vlan_id,
 						  ini->ib_gid, NULL)) {
 				ini->ib_dev = ibdev;
@@ -846,7 +848,8 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
 
 	spin_lock(&smcd_dev_list.lock);
 	list_for_each_entry(ismdev, &smcd_dev_list.list, list) {
-		if (smc_pnet_match(ismdev->pnetid, ndev_pnetid)) {
+		if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) &&
+		    !ismdev->going_away) {
 			ini->ism_dev = ismdev;
 			break;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 84a081f60db63aaae3665118203506aa09a7f94f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 11 Oct 2019 11:11:40 -0700
Subject: bpf: Align struct bpf_prog_stats

Do not risk spanning these small structures on two cache lines.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20191011181140.2898-1-edumazet@google.com
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b9d22338606..282e28bf41ec 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -363,7 +363,7 @@ struct bpf_prog_stats {
 	u64 cnt;
 	u64 nsecs;
 	struct u64_stats_sync syncp;
-};
+} __aligned(2 * sizeof(u64));
 
 struct bpf_prog_aux {
 	atomic_t refcnt;
-- 
cgit v1.2.3-59-g8ed1b


From e7a981050a7fb9a14b652365c00d9c5a025704ce Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 10 Oct 2019 15:18:49 +0200
Subject: devlink: propagate extack down to health reporter ops

During health reporter operations, driver might want to fill-up
the extack message, so propagate extack down to the health reporter ops.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c    |  9 ++++++---
 .../net/ethernet/mellanox/mlx5/core/en/reporter_rx.c |  6 ++++--
 .../net/ethernet/mellanox/mlx5/core/en/reporter_tx.c |  6 ++++--
 drivers/net/ethernet/mellanox/mlx5/core/health.c     | 12 ++++++++----
 include/net/devlink.h                                |  8 +++++---
 net/core/devlink.c                                   | 20 +++++++++++---------
 6 files changed, 38 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index e664392dccc0..ff1bc0ec2e7c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -16,7 +16,8 @@
 #include "bnxt_devlink.h"
 
 static int bnxt_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
-				     struct devlink_fmsg *fmsg)
+				     struct devlink_fmsg *fmsg,
+				     struct netlink_ext_ack *extack)
 {
 	struct bnxt *bp = devlink_health_reporter_priv(reporter);
 	struct bnxt_fw_health *health = bp->fw_health;
@@ -66,7 +67,8 @@ static const struct devlink_health_reporter_ops bnxt_dl_fw_reporter_ops = {
 };
 
 static int bnxt_fw_reset_recover(struct devlink_health_reporter *reporter,
-				 void *priv_ctx)
+				 void *priv_ctx,
+				 struct netlink_ext_ack *extack)
 {
 	struct bnxt *bp = devlink_health_reporter_priv(reporter);
 
@@ -84,7 +86,8 @@ struct devlink_health_reporter_ops bnxt_dl_fw_reset_reporter_ops = {
 };
 
 static int bnxt_fw_fatal_recover(struct devlink_health_reporter *reporter,
-				 void *priv_ctx)
+				 void *priv_ctx,
+				 struct netlink_ext_ack *extack)
 {
 	struct bnxt *bp = devlink_health_reporter_priv(reporter);
 	struct bnxt_fw_reporter_ctx *fw_reporter_ctx = priv_ctx;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
index b860569d4247..6c72b592315b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
@@ -222,7 +222,8 @@ static int mlx5e_rx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
 }
 
 static int mlx5e_rx_reporter_recover(struct devlink_health_reporter *reporter,
-				     void *context)
+				     void *context,
+				     struct netlink_ext_ack *extack)
 {
 	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
 	struct mlx5e_err_ctx *err_ctx = context;
@@ -301,7 +302,8 @@ static int mlx5e_rx_reporter_build_diagnose_output(struct mlx5e_rq *rq,
 }
 
 static int mlx5e_rx_reporter_diagnose(struct devlink_health_reporter *reporter,
-				      struct devlink_fmsg *fmsg)
+				      struct devlink_fmsg *fmsg,
+				      struct netlink_ext_ack *extack)
 {
 	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
 	struct mlx5e_params *params = &priv->channels.params;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
index bfed558637c2..b468549e96ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -135,7 +135,8 @@ static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
 }
 
 static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
-				     void *context)
+				     void *context,
+				     struct netlink_ext_ack *extack)
 {
 	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
 	struct mlx5e_err_ctx *err_ctx = context;
@@ -205,7 +206,8 @@ mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg,
 }
 
 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
-				      struct devlink_fmsg *fmsg)
+				      struct devlink_fmsg *fmsg,
+				      struct netlink_ext_ack *extack)
 {
 	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
 	struct mlx5e_txqsq *generic_sq = priv->txq2sq[0];
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index d685122d9ff7..be3c3c704bfc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -390,7 +390,8 @@ static void print_health_info(struct mlx5_core_dev *dev)
 
 static int
 mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
-			  struct devlink_fmsg *fmsg)
+			  struct devlink_fmsg *fmsg,
+			  struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
 	struct mlx5_core_health *health = &dev->priv.health;
@@ -491,7 +492,8 @@ mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev,
 
 static int
 mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter,
-		      struct devlink_fmsg *fmsg, void *priv_ctx)
+		      struct devlink_fmsg *fmsg, void *priv_ctx,
+		      struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
 	int err;
@@ -545,7 +547,8 @@ static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
 
 static int
 mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter,
-			       void *priv_ctx)
+			       void *priv_ctx,
+			       struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
 
@@ -555,7 +558,8 @@ mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter,
 #define MLX5_CR_DUMP_CHUNK_SIZE 256
 static int
 mlx5_fw_fatal_reporter_dump(struct devlink_health_reporter *reporter,
-			    struct devlink_fmsg *fmsg, void *priv_ctx)
+			    struct devlink_fmsg *fmsg, void *priv_ctx,
+			    struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
 	u32 crdump_size = dev->priv.health.crdump_size;
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 4095657fc23f..6bf3b9e0595a 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -507,11 +507,13 @@ enum devlink_health_reporter_state {
 struct devlink_health_reporter_ops {
 	char *name;
 	int (*recover)(struct devlink_health_reporter *reporter,
-		       void *priv_ctx);
+		       void *priv_ctx, struct netlink_ext_ack *extack);
 	int (*dump)(struct devlink_health_reporter *reporter,
-		    struct devlink_fmsg *fmsg, void *priv_ctx);
+		    struct devlink_fmsg *fmsg, void *priv_ctx,
+		    struct netlink_ext_ack *extack);
 	int (*diagnose)(struct devlink_health_reporter *reporter,
-			struct devlink_fmsg *fmsg);
+			struct devlink_fmsg *fmsg,
+			struct netlink_ext_ack *extack);
 };
 
 /**
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 95887462eecf..97e9a2246929 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4847,7 +4847,7 @@ EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update);
 
 static int
 devlink_health_reporter_recover(struct devlink_health_reporter *reporter,
-				void *priv_ctx)
+				void *priv_ctx, struct netlink_ext_ack *extack)
 {
 	int err;
 
@@ -4857,7 +4857,7 @@ devlink_health_reporter_recover(struct devlink_health_reporter *reporter,
 	if (!reporter->ops->recover)
 		return -EOPNOTSUPP;
 
-	err = reporter->ops->recover(reporter, priv_ctx);
+	err = reporter->ops->recover(reporter, priv_ctx, extack);
 	if (err)
 		return err;
 
@@ -4878,7 +4878,8 @@ devlink_health_dump_clear(struct devlink_health_reporter *reporter)
 }
 
 static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
-				  void *priv_ctx)
+				  void *priv_ctx,
+				  struct netlink_ext_ack *extack)
 {
 	int err;
 
@@ -4899,7 +4900,7 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
 		goto dump_err;
 
 	err = reporter->ops->dump(reporter, reporter->dump_fmsg,
-				  priv_ctx);
+				  priv_ctx, extack);
 	if (err)
 		goto dump_err;
 
@@ -4946,11 +4947,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
 
 	mutex_lock(&reporter->dump_lock);
 	/* store current dump of current error, for later analysis */
-	devlink_health_do_dump(reporter, priv_ctx);
+	devlink_health_do_dump(reporter, priv_ctx, NULL);
 	mutex_unlock(&reporter->dump_lock);
 
 	if (reporter->auto_recover)
-		return devlink_health_reporter_recover(reporter, priv_ctx);
+		return devlink_health_reporter_recover(reporter,
+						       priv_ctx, NULL);
 
 	return 0;
 }
@@ -5188,7 +5190,7 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb,
 	if (!reporter)
 		return -EINVAL;
 
-	err = devlink_health_reporter_recover(reporter, NULL);
+	err = devlink_health_reporter_recover(reporter, NULL, info->extack);
 
 	devlink_health_reporter_put(reporter);
 	return err;
@@ -5221,7 +5223,7 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb,
 	if (err)
 		goto out;
 
-	err = reporter->ops->diagnose(reporter, fmsg);
+	err = reporter->ops->diagnose(reporter, fmsg, info->extack);
 	if (err)
 		goto out;
 
@@ -5256,7 +5258,7 @@ devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb,
 	}
 	mutex_lock(&reporter->dump_lock);
 	if (!start) {
-		err = devlink_health_do_dump(reporter, NULL);
+		err = devlink_health_do_dump(reporter, NULL, cb->extack);
 		if (err)
 			goto unlock;
 		cb->args[1] = reporter->dump_ts;
-- 
cgit v1.2.3-59-g8ed1b


From 14af7fd1d4279c8db7fbbb3ca0df3b13179eb502 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sat, 12 Oct 2019 18:27:57 +0200
Subject: ethtool: Add support for 400Gbps (50Gbps per lane) link modes

Add support for 400Gbps speed, link modes of 50Gbps per lane

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy-core.c   | 10 +++++++++-
 include/uapi/linux/ethtool.h |  6 ++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c
index 9412669b579c..4d96f7a8e8f2 100644
--- a/drivers/net/phy/phy-core.c
+++ b/drivers/net/phy/phy-core.c
@@ -8,7 +8,7 @@
 
 const char *phy_speed_to_str(int speed)
 {
-	BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 69,
+	BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 74,
 		"Enum ethtool_link_mode_bit_indices and phylib are out of sync. "
 		"If a speed or mode has been added please update phy_speed_to_str "
 		"and the PHY settings array.\n");
@@ -42,6 +42,8 @@ const char *phy_speed_to_str(int speed)
 		return "100Gbps";
 	case SPEED_200000:
 		return "200Gbps";
+	case SPEED_400000:
+		return "400Gbps";
 	case SPEED_UNKNOWN:
 		return "Unknown";
 	default:
@@ -70,6 +72,12 @@ EXPORT_SYMBOL_GPL(phy_duplex_to_str);
 			       .bit = ETHTOOL_LINK_MODE_ ## b ## _BIT}
 
 static const struct phy_setting settings[] = {
+	/* 400G */
+	PHY_SETTING( 400000, FULL, 400000baseCR8_Full		),
+	PHY_SETTING( 400000, FULL, 400000baseKR8_Full		),
+	PHY_SETTING( 400000, FULL, 400000baseLR8_ER8_FR8_Full	),
+	PHY_SETTING( 400000, FULL, 400000baseDR8_Full		),
+	PHY_SETTING( 400000, FULL, 400000baseSR8_Full		),
 	/* 200G */
 	PHY_SETTING( 200000, FULL, 200000baseCR4_Full		),
 	PHY_SETTING( 200000, FULL, 200000baseKR4_Full		),
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 8938b76c4ee3..d4591792f0b4 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1507,6 +1507,11 @@ enum ethtool_link_mode_bit_indices {
 	ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT	 = 66,
 	ETHTOOL_LINK_MODE_100baseT1_Full_BIT		 = 67,
 	ETHTOOL_LINK_MODE_1000baseT1_Full_BIT		 = 68,
+	ETHTOOL_LINK_MODE_400000baseKR8_Full_BIT	 = 69,
+	ETHTOOL_LINK_MODE_400000baseSR8_Full_BIT	 = 70,
+	ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT = 71,
+	ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT	 = 72,
+	ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT	 = 73,
 
 	/* must be last entry */
 	__ETHTOOL_LINK_MODE_MASK_NBITS
@@ -1618,6 +1623,7 @@ enum ethtool_link_mode_bit_indices {
 #define SPEED_56000		56000
 #define SPEED_100000		100000
 #define SPEED_200000		200000
+#define SPEED_400000		400000
 
 #define SPEED_UNKNOWN		-1
 
-- 
cgit v1.2.3-59-g8ed1b


From 554032cdfbf4491f38241a3f6b27459408d90df3 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 15 Oct 2019 11:28:46 +0100
Subject: net: phylink: use more linkmode_*

Use more linkmode_* helpers rather than open-coding the bitmap
operations.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 7 ++-----
 include/linux/linkmode.h  | 6 ++++++
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index a5a57ca94c1a..8e53ed90da3c 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -117,9 +117,7 @@ static int phylink_is_empty_linkmode(const unsigned long *linkmode)
 	phylink_set(tmp, Pause);
 	phylink_set(tmp, Asym_Pause);
 
-	bitmap_andnot(tmp, linkmode, tmp, __ETHTOOL_LINK_MODE_MASK_NBITS);
-
-	return linkmode_empty(tmp);
+	return linkmode_subset(linkmode, tmp);
 }
 
 static const char *phylink_an_mode_str(unsigned int mode)
@@ -1728,8 +1726,7 @@ static int phylink_sfp_module_insert(void *upstream,
 	if (phy_interface_mode_is_8023z(iface) && pl->phydev)
 		return -EINVAL;
 
-	changed = !bitmap_equal(pl->supported, support,
-				__ETHTOOL_LINK_MODE_MASK_NBITS);
+	changed = !linkmode_equal(pl->supported, support);
 	if (changed) {
 		linkmode_copy(pl->supported, support);
 		linkmode_copy(pl->link_config.advertising, config.advertising);
diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h
index a99c58866860..fe740031339d 100644
--- a/include/linux/linkmode.h
+++ b/include/linux/linkmode.h
@@ -82,4 +82,10 @@ static inline int linkmode_equal(const unsigned long *src1,
 	return bitmap_equal(src1, src2, __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
+static inline int linkmode_subset(const unsigned long *src1,
+				  const unsigned long *src2)
+{
+	return bitmap_subset(src1, src2, __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
 #endif /* __LINKMODE_H */
-- 
cgit v1.2.3-59-g8ed1b


From 2203cbf2c8b58a1e3bef98c47531d431d11639a0 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 15 Oct 2019 11:38:39 +0100
Subject: net: sfp: move fwnode parsing into sfp-bus layer

Rather than parsing the sfp firmware node in phylink, parse it in the
sfp-bus code, so we can re-use this code for PHYs without having to
duplicate the parsing.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 21 +++++----------
 drivers/net/phy/sfp-bus.c | 65 ++++++++++++++++++++++++++++++-----------------
 include/linux/sfp.h       | 10 ++++----
 3 files changed, 53 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 8e53ed90da3c..4c5e8b4f8d80 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -548,26 +548,17 @@ static const struct sfp_upstream_ops sfp_phylink_ops;
 static int phylink_register_sfp(struct phylink *pl,
 				struct fwnode_handle *fwnode)
 {
-	struct fwnode_reference_args ref;
+	struct sfp_bus *bus;
 	int ret;
 
-	if (!fwnode)
-		return 0;
-
-	ret = fwnode_property_get_reference_args(fwnode, "sfp", NULL,
-						 0, 0, &ref);
-	if (ret < 0) {
-		if (ret == -ENOENT)
-			return 0;
-
-		phylink_err(pl, "unable to parse \"sfp\" node: %d\n",
-			    ret);
+	bus = sfp_register_upstream_node(fwnode, pl, &sfp_phylink_ops);
+	if (IS_ERR(bus)) {
+		ret = PTR_ERR(bus);
+		phylink_err(pl, "unable to attach SFP bus: %d\n", ret);
 		return ret;
 	}
 
-	pl->sfp_bus = sfp_register_upstream(ref.fwnode, pl, &sfp_phylink_ops);
-	if (!pl->sfp_bus)
-		return -ENOMEM;
+	pl->sfp_bus = bus;
 
 	return 0;
 }
diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index b23fc41896ef..d037aab6a71d 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -4,6 +4,7 @@
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/phylink.h>
+#include <linux/property.h>
 #include <linux/rtnetlink.h>
 #include <linux/slab.h>
 
@@ -445,45 +446,63 @@ static void sfp_upstream_clear(struct sfp_bus *bus)
 }
 
 /**
- * sfp_register_upstream() - Register the neighbouring device
- * @fwnode: firmware node for the SFP bus
+ * sfp_register_upstream_node() - parse and register the neighbouring device
+ * @fwnode: firmware node for the parent device (MAC or PHY)
  * @upstream: the upstream private data
  * @ops: the upstream's &struct sfp_upstream_ops
  *
- * Register the upstream device (eg, PHY) with the SFP bus. MAC drivers
- * should use phylink, which will call this function for them. Returns
- * a pointer to the allocated &struct sfp_bus.
+ * Parse the parent device's firmware node for a SFP bus, and register the
+ * SFP bus using sfp_register_upstream().
  *
- * On error, returns %NULL.
+ * Returns: on success, a pointer to the sfp_bus structure,
+ *	    %NULL if no SFP is specified,
+ * 	    on failure, an error pointer value:
+ * 		corresponding to the errors detailed for
+ * 		fwnode_property_get_reference_args().
+ * 	        %-ENOMEM if we failed to allocate the bus.
+ *		an error from the upstream's connect_phy() method.
  */
-struct sfp_bus *sfp_register_upstream(struct fwnode_handle *fwnode,
-				      void *upstream,
-				      const struct sfp_upstream_ops *ops)
+struct sfp_bus *sfp_register_upstream_node(struct fwnode_handle *fwnode,
+					   void *upstream,
+					   const struct sfp_upstream_ops *ops)
 {
-	struct sfp_bus *bus = sfp_bus_get(fwnode);
-	int ret = 0;
+	struct fwnode_reference_args ref;
+	struct sfp_bus *bus;
+	int ret;
 
-	if (bus) {
-		rtnl_lock();
-		bus->upstream_ops = ops;
-		bus->upstream = upstream;
+	ret = fwnode_property_get_reference_args(fwnode, "sfp", NULL,
+						 0, 0, &ref);
+	if (ret == -ENOENT)
+		return NULL;
+	else if (ret < 0)
+		return ERR_PTR(ret);
 
-		if (bus->sfp) {
-			ret = sfp_register_bus(bus);
-			if (ret)
-				sfp_upstream_clear(bus);
-		}
-		rtnl_unlock();
+	bus = sfp_bus_get(ref.fwnode);
+	fwnode_handle_put(ref.fwnode);
+	if (!bus)
+		return ERR_PTR(-ENOMEM);
+
+	rtnl_lock();
+	bus->upstream_ops = ops;
+	bus->upstream = upstream;
+
+	if (bus->sfp) {
+		ret = sfp_register_bus(bus);
+		if (ret)
+			sfp_upstream_clear(bus);
+	} else {
+		ret = 0;
 	}
+	rtnl_unlock();
 
 	if (ret) {
 		sfp_bus_put(bus);
-		bus = NULL;
+		bus = ERR_PTR(ret);
 	}
 
 	return bus;
 }
-EXPORT_SYMBOL_GPL(sfp_register_upstream);
+EXPORT_SYMBOL_GPL(sfp_register_upstream_node);
 
 /**
  * sfp_unregister_upstream() - Unregister sfp bus
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 1c35428e98bc..355a08a76fd4 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -508,9 +508,9 @@ int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee,
 			  u8 *data);
 void sfp_upstream_start(struct sfp_bus *bus);
 void sfp_upstream_stop(struct sfp_bus *bus);
-struct sfp_bus *sfp_register_upstream(struct fwnode_handle *fwnode,
-				      void *upstream,
-				      const struct sfp_upstream_ops *ops);
+struct sfp_bus *sfp_register_upstream_node(struct fwnode_handle *fwnode,
+					   void *upstream,
+					   const struct sfp_upstream_ops *ops);
 void sfp_unregister_upstream(struct sfp_bus *bus);
 #else
 static inline int sfp_parse_port(struct sfp_bus *bus,
@@ -553,11 +553,11 @@ static inline void sfp_upstream_stop(struct sfp_bus *bus)
 {
 }
 
-static inline struct sfp_bus *sfp_register_upstream(
+static inline struct sfp_bus *sfp_register_upstream_node(
 	struct fwnode_handle *fwnode, void *upstream,
 	const struct sfp_upstream_ops *ops)
 {
-	return (struct sfp_bus *)-1;
+	return NULL;
 }
 
 static inline void sfp_unregister_upstream(struct sfp_bus *bus)
-- 
cgit v1.2.3-59-g8ed1b


From 2ad9d7747c10d17cc06447944fefd4c29ae11eb1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 15 Oct 2019 15:19:15 +0200
Subject: netfilter: conntrack: free extension area immediately

Instead of waiting for rcu grace period just free it directly.

This is safe because conntrack lookup doesn't consider extensions.

Other accesses happen while ct->ext can't be free'd, either because
a ct refcount was taken or because the conntrack hash bucket lock or
the dying list spinlock have been taken.

This allows to remove __krealloc in a followup patch, netfilter was the
only user.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_extend.h | 10 ----------
 net/netfilter/nf_conntrack_core.c           |  2 --
 net/netfilter/nf_conntrack_extend.c         | 21 ++++++++++-----------
 3 files changed, 10 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 112a6f40dfaf..5ae5295aa46d 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -43,7 +43,6 @@ enum nf_ct_ext_id {
 
 /* Extensions: optional stuff which isn't permanently in struct. */
 struct nf_ct_ext {
-	struct rcu_head rcu;
 	u8 offset[NF_CT_EXT_NUM];
 	u8 len;
 	char data[0];
@@ -72,15 +71,6 @@ static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id)
 /* Destroy all relationships */
 void nf_ct_ext_destroy(struct nf_conn *ct);
 
-/* Free operation. If you want to free a object referred from private area,
- * please implement __nf_ct_ext_free() and call it.
- */
-static inline void nf_ct_ext_free(struct nf_conn *ct)
-{
-	if (ct->ext)
-		kfree_rcu(ct->ext, rcu);
-}
-
 /* Add this type, returns pointer to data or NULL. */
 void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp);
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0c63120b2db2..bcccaa7ec34c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -573,7 +573,6 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
 void nf_ct_tmpl_free(struct nf_conn *tmpl)
 {
 	nf_ct_ext_destroy(tmpl);
-	nf_ct_ext_free(tmpl);
 
 	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
 		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
@@ -1417,7 +1416,6 @@ void nf_conntrack_free(struct nf_conn *ct)
 	WARN_ON(atomic_read(&ct->ct_general.use) != 0);
 
 	nf_ct_ext_destroy(ct);
-	nf_ct_ext_free(ct);
 	kmem_cache_free(nf_conntrack_cachep, ct);
 	smp_mb__before_atomic();
 	atomic_dec(&net->ct.count);
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index d4ed1e197921..c24e5b64b00c 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -34,21 +34,24 @@ void nf_ct_ext_destroy(struct nf_conn *ct)
 			t->destroy(ct);
 		rcu_read_unlock();
 	}
+
+	kfree(ct->ext);
 }
 EXPORT_SYMBOL(nf_ct_ext_destroy);
 
 void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
 {
 	unsigned int newlen, newoff, oldlen, alloc;
-	struct nf_ct_ext *old, *new;
 	struct nf_ct_ext_type *t;
+	struct nf_ct_ext *new;
 
 	/* Conntrack must not be confirmed to avoid races on reallocation. */
 	WARN_ON(nf_ct_is_confirmed(ct));
 
-	old = ct->ext;
 
-	if (old) {
+	if (ct->ext) {
+		const struct nf_ct_ext *old = ct->ext;
+
 		if (__nf_ct_ext_exist(old, id))
 			return NULL;
 		oldlen = old->len;
@@ -68,22 +71,18 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
 	rcu_read_unlock();
 
 	alloc = max(newlen, NF_CT_EXT_PREALLOC);
-	kmemleak_not_leak(old);
-	new = __krealloc(old, alloc, gfp);
+	new = krealloc(ct->ext, alloc, gfp);
 	if (!new)
 		return NULL;
 
-	if (!old) {
+	if (!ct->ext)
 		memset(new->offset, 0, sizeof(new->offset));
-		ct->ext = new;
-	} else if (new != old) {
-		kfree_rcu(old, rcu);
-		rcu_assign_pointer(ct->ext, new);
-	}
 
 	new->offset[id] = newoff;
 	new->len = newlen;
 	memset((void *)new + newoff, 0, newlen - newoff);
+
+	ct->ext = new;
 	return (void *)new + newoff;
 }
 EXPORT_SYMBOL(nf_ct_ext_add);
-- 
cgit v1.2.3-59-g8ed1b


From ca58fbe06c54795f00db79e447f94c2028d30124 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 11 Oct 2019 00:30:37 +0200
Subject: netfilter: add and use nf_hook_slow_list()

At this time, NF_HOOK_LIST() macro will iterate the list and then calls
nf_hook() for each individual skb.

This makes it so the entire list is passed into the netfilter core.
The advantage is that we only need to fetch the rule blob once per list
instead of per-skb.

NF_HOOK_LIST now only works for ipv4 and ipv6, as those are the only
callers.

v2: use skb_list_del_init() instead of list_del (Edward Cree)

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 41 +++++++++++++++++++++++++++++++----------
 net/netfilter/core.c      | 20 ++++++++++++++++++++
 2 files changed, 51 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 77ebb61faf48..eb312e7ca36e 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -199,6 +199,8 @@ extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
 		 const struct nf_hook_entries *e, unsigned int i);
 
+void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
+		       const struct nf_hook_entries *e);
 /**
  *	nf_hook - call a netfilter hook
  *
@@ -311,17 +313,36 @@ NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
 	     struct list_head *head, struct net_device *in, struct net_device *out,
 	     int (*okfn)(struct net *, struct sock *, struct sk_buff *))
 {
-	struct sk_buff *skb, *next;
-	struct list_head sublist;
-
-	INIT_LIST_HEAD(&sublist);
-	list_for_each_entry_safe(skb, next, head, list) {
-		list_del(&skb->list);
-		if (nf_hook(pf, hook, net, sk, skb, in, out, okfn) == 1)
-			list_add_tail(&skb->list, &sublist);
+	struct nf_hook_entries *hook_head = NULL;
+
+#ifdef CONFIG_JUMP_LABEL
+	if (__builtin_constant_p(pf) &&
+	    __builtin_constant_p(hook) &&
+	    !static_key_false(&nf_hooks_needed[pf][hook]))
+		return;
+#endif
+
+	rcu_read_lock();
+	switch (pf) {
+	case NFPROTO_IPV4:
+		hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
+		break;
+	case NFPROTO_IPV6:
+		hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
 	}
-	/* Put passed packets back on main list */
-	list_splice(&sublist, head);
+
+	if (hook_head) {
+		struct nf_hook_state state;
+
+		nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn);
+
+		nf_hook_slow_list(head, &state, hook_head);
+	}
+	rcu_read_unlock();
 }
 
 /* Call setsockopt() */
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 5d5bdf450091..78f046ec506f 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -536,6 +536,26 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
 }
 EXPORT_SYMBOL(nf_hook_slow);
 
+void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
+		       const struct nf_hook_entries *e)
+{
+	struct sk_buff *skb, *next;
+	struct list_head sublist;
+	int ret;
+
+	INIT_LIST_HEAD(&sublist);
+
+	list_for_each_entry_safe(skb, next, head, list) {
+		skb_list_del_init(skb);
+		ret = nf_hook_slow(skb, state, e, 0);
+		if (ret == 1)
+			list_add_tail(&skb->list, &sublist);
+	}
+	/* Put passed packets back on main list */
+	list_splice(&sublist, head);
+}
+EXPORT_SYMBOL(nf_hook_slow_list);
+
 /* This needs to be compiled in any case to avoid dependencies between the
  * nfnetlink_queue code and nf_conntrack.
  */
-- 
cgit v1.2.3-59-g8ed1b


From e8c423fb31fa8b1ef6d7cd14a168de33e7c0d702 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 15 Oct 2019 20:24:55 -0700
Subject: bpf: Add typecast to raw_tracepoints to help BTF generation

When pahole converts dwarf to btf it emits only used types.
Wrap existing __bpf_trace_##template() function into
btf_trace_##template typedef and use it in type cast to
make gcc emits this type into dwarf. Then pahole will convert it to btf.
The "btf_trace_" prefix will be used to identify BTF enabled raw tracepoints.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20191016032505.2089704-2-ast@kernel.org
---
 include/trace/bpf_probe.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index d6e556c0a085..b04c29270973 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -74,11 +74,12 @@ static inline void bpf_test_probe_##call(void)				\
 {									\
 	check_trace_callback_type_##call(__bpf_trace_##template);	\
 }									\
+typedef void (*btf_trace_##call)(void *__data, proto);			\
 static struct bpf_raw_event_map	__used					\
 	__attribute__((section("__bpf_raw_tp_map")))			\
 __bpf_trace_tp_map_##call = {						\
 	.tp		= &__tracepoint_##call,				\
-	.bpf_func	= (void *)__bpf_trace_##template,		\
+	.bpf_func	= (void *)(btf_trace_##call)__bpf_trace_##template,	\
 	.num_args	= COUNT_ARGS(args),				\
 	.writable_size	= size,						\
 };
-- 
cgit v1.2.3-59-g8ed1b


From 7c6a469e3416fa23568c2395a3faa7dd6e376dcb Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 15 Oct 2019 20:24:56 -0700
Subject: bpf: Add typecast to bpf helpers to help BTF generation

When pahole converts dwarf to btf it emits only used types.
Wrap existing bpf helper functions into typedef and use it in
typecast to make gcc emits this type into dwarf.
Then pahole will convert it to btf.
The "btf_#name_of_helper" types will be used to figure out
types of arguments of bpf helpers.
The generated code before and after is the same.
Only dwarf and btf sections are different.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20191016032505.2089704-3-ast@kernel.org
---
 include/linux/filter.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2ce57645f3cd..d3d51d7aff2c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -464,10 +464,11 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
 #define BPF_CALL_x(x, name, ...)					       \
 	static __always_inline						       \
 	u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__));   \
+	typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \
 	u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__));	       \
 	u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__))	       \
 	{								       \
-		return ____##name(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\
+		return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\
 	}								       \
 	static __always_inline						       \
 	u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__))
-- 
cgit v1.2.3-59-g8ed1b


From 8580ac9404f6240668a026785d7d8856f0530409 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 15 Oct 2019 20:24:57 -0700
Subject: bpf: Process in-kernel BTF

If in-kernel BTF exists parse it and prepare 'struct btf *btf_vmlinux'
for further use by the verifier.
In-kernel BTF is trusted just like kallsyms and other build artifacts
embedded into vmlinux.
Yet run this BTF image through BTF verifier to make sure
that it is valid and it wasn't mangled during the build.
If in-kernel BTF is incorrect it means either gcc or pahole or kernel
are buggy. In such case disallow loading BPF programs.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20191016032505.2089704-4-ast@kernel.org
---
 include/linux/bpf_verifier.h |  4 ++-
 include/linux/btf.h          |  1 +
 kernel/bpf/btf.c             | 71 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/verifier.c        | 20 +++++++++++++
 4 files changed, 94 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 26a6d58ca78c..713efae62e96 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -330,10 +330,12 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 #define BPF_LOG_STATS	4
 #define BPF_LOG_LEVEL	(BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2)
 #define BPF_LOG_MASK	(BPF_LOG_LEVEL | BPF_LOG_STATS)
+#define BPF_LOG_KERNEL	(BPF_LOG_MASK + 1) /* kernel internal flag */
 
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 {
-	return log->level && log->ubuf && !bpf_verifier_log_full(log);
+	return (log->level && log->ubuf && !bpf_verifier_log_full(log)) ||
+		log->level == BPF_LOG_KERNEL;
 }
 
 #define BPF_MAX_SUBPROGS 256
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 64cdf2a23d42..55d43bc856be 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -56,6 +56,7 @@ bool btf_type_is_void(const struct btf_type *t);
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
 const char *btf_name_by_offset(const struct btf *btf, u32 offset);
+struct btf *btf_parse_vmlinux(void);
 #else
 static inline const struct btf_type *btf_type_by_id(const struct btf *btf,
 						    u32 type_id)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 29c7c06c6bd6..ddeab1e8d21e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -698,6 +698,13 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
 	if (!bpf_verifier_log_needed(log))
 		return;
 
+	/* btf verifier prints all types it is processing via
+	 * btf_verifier_log_type(..., fmt = NULL).
+	 * Skip those prints for in-kernel BTF verification.
+	 */
+	if (log->level == BPF_LOG_KERNEL && !fmt)
+		return;
+
 	__btf_verifier_log(log, "[%u] %s %s%s",
 			   env->log_type_id,
 			   btf_kind_str[kind],
@@ -735,6 +742,8 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
 	if (!bpf_verifier_log_needed(log))
 		return;
 
+	if (log->level == BPF_LOG_KERNEL && !fmt)
+		return;
 	/* The CHECK_META phase already did a btf dump.
 	 *
 	 * If member is logged again, it must hit an error in
@@ -777,6 +786,8 @@ static void btf_verifier_log_vsi(struct btf_verifier_env *env,
 
 	if (!bpf_verifier_log_needed(log))
 		return;
+	if (log->level == BPF_LOG_KERNEL && !fmt)
+		return;
 	if (env->phase != CHECK_META)
 		btf_verifier_log_type(env, datasec_type, NULL);
 
@@ -802,6 +813,8 @@ static void btf_verifier_log_hdr(struct btf_verifier_env *env,
 	if (!bpf_verifier_log_needed(log))
 		return;
 
+	if (log->level == BPF_LOG_KERNEL)
+		return;
 	hdr = &btf->hdr;
 	__btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
 	__btf_verifier_log(log, "version: %u\n", hdr->version);
@@ -2405,7 +2418,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 			return -EINVAL;
 		}
 
-
+		if (env->log.level == BPF_LOG_KERNEL)
+			continue;
 		btf_verifier_log(env, "\t%s val=%d\n",
 				 __btf_name_by_offset(btf, enums[i].name_off),
 				 enums[i].val);
@@ -3367,6 +3381,61 @@ errout:
 	return ERR_PTR(err);
 }
 
+extern char __weak _binary__btf_vmlinux_bin_start[];
+extern char __weak _binary__btf_vmlinux_bin_end[];
+
+struct btf *btf_parse_vmlinux(void)
+{
+	struct btf_verifier_env *env = NULL;
+	struct bpf_verifier_log *log;
+	struct btf *btf = NULL;
+	int err;
+
+	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+	if (!env)
+		return ERR_PTR(-ENOMEM);
+
+	log = &env->log;
+	log->level = BPF_LOG_KERNEL;
+
+	btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
+	if (!btf) {
+		err = -ENOMEM;
+		goto errout;
+	}
+	env->btf = btf;
+
+	btf->data = _binary__btf_vmlinux_bin_start;
+	btf->data_size = _binary__btf_vmlinux_bin_end -
+		_binary__btf_vmlinux_bin_start;
+
+	err = btf_parse_hdr(env);
+	if (err)
+		goto errout;
+
+	btf->nohdr_data = btf->data + btf->hdr.hdr_len;
+
+	err = btf_parse_str_sec(env);
+	if (err)
+		goto errout;
+
+	err = btf_check_all_metas(env);
+	if (err)
+		goto errout;
+
+	btf_verifier_env_free(env);
+	refcount_set(&btf->refcnt, 1);
+	return btf;
+
+errout:
+	btf_verifier_env_free(env);
+	if (btf) {
+		kvfree(btf->types);
+		kfree(btf);
+	}
+	return ERR_PTR(err);
+}
+
 void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d3446f018b9a..466b3b19de4d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -207,6 +207,8 @@ struct bpf_call_arg_meta {
 	int func_id;
 };
 
+struct btf *btf_vmlinux;
+
 static DEFINE_MUTEX(bpf_verifier_lock);
 
 static const struct bpf_line_info *
@@ -243,6 +245,10 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 	n = min(log->len_total - log->len_used - 1, n);
 	log->kbuf[n] = '\0';
 
+	if (log->level == BPF_LOG_KERNEL) {
+		pr_err("BPF:%s\n", log->kbuf);
+		return;
+	}
 	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
 		log->len_used += n;
 	else
@@ -9294,6 +9300,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	env->ops = bpf_verifier_ops[env->prog->type];
 	is_priv = capable(CAP_SYS_ADMIN);
 
+	if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+		mutex_lock(&bpf_verifier_lock);
+		if (!btf_vmlinux)
+			btf_vmlinux = btf_parse_vmlinux();
+		mutex_unlock(&bpf_verifier_lock);
+	}
+
 	/* grab the mutex to protect few globals used by verifier */
 	if (!is_priv)
 		mutex_lock(&bpf_verifier_lock);
@@ -9313,6 +9326,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 			goto err_unlock;
 	}
 
+	if (IS_ERR(btf_vmlinux)) {
+		/* Either gcc or pahole or kernel are broken. */
+		verbose(env, "in-kernel BTF is malformed\n");
+		ret = PTR_ERR(btf_vmlinux);
+		goto err_unlock;
+	}
+
 	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
-- 
cgit v1.2.3-59-g8ed1b


From ccfe29eb29c2edcea6552072ef00ff4117f53e83 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 15 Oct 2019 20:24:58 -0700
Subject: bpf: Add attach_btf_id attribute to program load

Add attach_btf_id attribute to prog_load command.
It's similar to existing expected_attach_type attribute which is
used in several cgroup based program types.
Unfortunately expected_attach_type is ignored for
tracing programs and cannot be reused for new purpose.
Hence introduce attach_btf_id to verify bpf programs against
given in-kernel BTF type id at load time.
It is strictly checked to be valid for raw_tp programs only.
In a later patches it will become:
btf_id == 0 semantics of existing raw_tp progs.
btd_id > 0 raw_tp with BTF and additional type safety.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20191016032505.2089704-5-ast@kernel.org
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           | 18 ++++++++++++++----
 tools/include/uapi/linux/bpf.h |  1 +
 4 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 282e28bf41ec..f916380675dd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -375,6 +375,7 @@ struct bpf_prog_aux {
 	u32 id;
 	u32 func_cnt; /* used by non-func prog as the number of func progs */
 	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
+	u32 attach_btf_id; /* in-kernel BTF type id to attach to */
 	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
 	bool offload_requested;
 	struct bpf_prog **func;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a65c3b0c6935..3bb2cd1de341 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -420,6 +420,7 @@ union bpf_attr {
 		__u32		line_info_rec_size;	/* userspace bpf_line_info size */
 		__aligned_u64	line_info;	/* line info */
 		__u32		line_info_cnt;	/* number of bpf_line_info records */
+		__u32		attach_btf_id;	/* in-kernel BTF type id to attach to */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 82eabd4e38ad..b56c482c9760 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,7 @@
 #include <linux/timekeeping.h>
 #include <linux/ctype.h>
 #include <linux/nospec.h>
+#include <uapi/linux/btf.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
 			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -1565,8 +1566,9 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
 }
 
 static int
-bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
-				enum bpf_attach_type expected_attach_type)
+bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
+			   enum bpf_attach_type expected_attach_type,
+			   u32 btf_id)
 {
 	switch (prog_type) {
 	case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -1608,13 +1610,19 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		default:
 			return -EINVAL;
 		}
+	case BPF_PROG_TYPE_RAW_TRACEPOINT:
+		if (btf_id > BTF_MAX_TYPE)
+			return -EINVAL;
+		return 0;
 	default:
+		if (btf_id)
+			return -EINVAL;
 		return 0;
 	}
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD line_info_cnt
+#define	BPF_PROG_LOAD_LAST_FIELD attach_btf_id
 
 static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 {
@@ -1656,7 +1664,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 		return -EPERM;
 
 	bpf_prog_load_fixup_attach_type(attr);
-	if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type))
+	if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
+				       attr->attach_btf_id))
 		return -EINVAL;
 
 	/* plain bpf_prog allocation */
@@ -1665,6 +1674,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 		return -ENOMEM;
 
 	prog->expected_attach_type = attr->expected_attach_type;
+	prog->aux->attach_btf_id = attr->attach_btf_id;
 
 	prog->aux->offload_requested = !!attr->prog_ifindex;
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a65c3b0c6935..3bb2cd1de341 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -420,6 +420,7 @@ union bpf_attr {
 		__u32		line_info_rec_size;	/* userspace bpf_line_info size */
 		__aligned_u64	line_info;	/* line info */
 		__u32		line_info_cnt;	/* number of bpf_line_info records */
+		__u32		attach_btf_id;	/* in-kernel BTF type id to attach to */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
-- 
cgit v1.2.3-59-g8ed1b


From 9e15db66136a14cde3f35691f1d839d950118826 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 15 Oct 2019 20:25:00 -0700
Subject: bpf: Implement accurate raw_tp context access via BTF

libbpf analyzes bpf C program, searches in-kernel BTF for given type name
and stores it into expected_attach_type.
The kernel verifier expects this btf_id to point to something like:
typedef void (*btf_trace_kfree_skb)(void *, struct sk_buff *skb, void *loc);
which represents signature of raw_tracepoint "kfree_skb".

Then btf_ctx_access() matches ctx+0 access in bpf program with 'skb'
and 'ctx+8' access with 'loc' arguments of "kfree_skb" tracepoint.
In first case it passes btf_id of 'struct sk_buff *' back to the verifier core
and 'void *' in second case.

Then the verifier tracks PTR_TO_BTF_ID as any other pointer type.
Like PTR_TO_SOCKET points to 'struct bpf_sock',
PTR_TO_TCP_SOCK points to 'struct bpf_tcp_sock', and so on.
PTR_TO_BTF_ID points to in-kernel structs.
If 1234 is btf_id of 'struct sk_buff' in vmlinux's BTF
then PTR_TO_BTF_ID#1234 points to one of in kernel skbs.

When PTR_TO_BTF_ID#1234 is dereferenced (like r2 = *(u64 *)r1 + 32)
the btf_struct_access() checks which field of 'struct sk_buff' is
at offset 32. Checks that size of access matches type definition
of the field and continues to track the dereferenced type.
If that field was a pointer to 'struct net_device' the r2's type
will be PTR_TO_BTF_ID#456. Where 456 is btf_id of 'struct net_device'
in vmlinux's BTF.

Such verifier analysis prevents "cheating" in BPF C program.
The program cannot cast arbitrary pointer to 'struct sk_buff *'
and access it. C compiler would allow type cast, of course,
but the verifier will notice type mismatch based on BPF assembly
and in-kernel BTF.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20191016032505.2089704-7-ast@kernel.org
---
 include/linux/bpf.h          |  17 +++-
 include/linux/bpf_verifier.h |   4 +
 kernel/bpf/btf.c             | 190 +++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c        |  88 +++++++++++++++++++-
 kernel/trace/bpf_trace.c     |   2 +-
 5 files changed, 296 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f916380675dd..028555fcd10d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -16,6 +16,7 @@
 #include <linux/u64_stats_sync.h>
 
 struct bpf_verifier_env;
+struct bpf_verifier_log;
 struct perf_event;
 struct bpf_prog;
 struct bpf_map;
@@ -281,6 +282,7 @@ enum bpf_reg_type {
 	PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
 	PTR_TO_TP_BUFFER,	 /* reg points to a writable raw tp's buffer */
 	PTR_TO_XDP_SOCK,	 /* reg points to struct xdp_sock */
+	PTR_TO_BTF_ID,		 /* reg points to kernel struct */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -288,7 +290,11 @@ enum bpf_reg_type {
  */
 struct bpf_insn_access_aux {
 	enum bpf_reg_type reg_type;
-	int ctx_field_size;
+	union {
+		int ctx_field_size;
+		u32 btf_id;
+	};
+	struct bpf_verifier_log *log; /* for verbose logs */
 };
 
 static inline void
@@ -483,6 +489,7 @@ struct bpf_event_entry {
 
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 int bpf_prog_calc_tag(struct bpf_prog *fp);
+const char *kernel_type_name(u32 btf_type_id);
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
@@ -748,6 +755,14 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 				     const union bpf_attr *kattr,
 				     union bpf_attr __user *uattr);
+bool btf_ctx_access(int off, int size, enum bpf_access_type type,
+		    const struct bpf_prog *prog,
+		    struct bpf_insn_access_aux *info);
+int btf_struct_access(struct bpf_verifier_log *log,
+		      const struct btf_type *t, int off, int size,
+		      enum bpf_access_type atype,
+		      u32 *next_btf_id);
+
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 713efae62e96..6e7284ea1468 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -52,6 +52,8 @@ struct bpf_reg_state {
 		 */
 		struct bpf_map *map_ptr;
 
+		u32 btf_id; /* for PTR_TO_BTF_ID */
+
 		/* Max size from any of the above. */
 		unsigned long raw;
 	};
@@ -399,6 +401,8 @@ __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
 				      const char *fmt, va_list args);
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
+			    const char *fmt, ...);
 
 static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ddeab1e8d21e..271d27cd427f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3436,6 +3436,196 @@ errout:
 	return ERR_PTR(err);
 }
 
+extern struct btf *btf_vmlinux;
+
+bool btf_ctx_access(int off, int size, enum bpf_access_type type,
+		    const struct bpf_prog *prog,
+		    struct bpf_insn_access_aux *info)
+{
+	struct bpf_verifier_log *log = info->log;
+	u32 btf_id = prog->aux->attach_btf_id;
+	const struct btf_param *args;
+	const struct btf_type *t;
+	const char prefix[] = "btf_trace_";
+	const char *tname;
+	u32 nr_args, arg;
+
+	if (!btf_id)
+		return true;
+
+	if (IS_ERR(btf_vmlinux)) {
+		bpf_log(log, "btf_vmlinux is malformed\n");
+		return false;
+	}
+
+	t = btf_type_by_id(btf_vmlinux, btf_id);
+	if (!t || BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) {
+		bpf_log(log, "btf_id is invalid\n");
+		return false;
+	}
+
+	tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
+	if (strncmp(prefix, tname, sizeof(prefix) - 1)) {
+		bpf_log(log, "btf_id points to wrong type name %s\n", tname);
+		return false;
+	}
+	tname += sizeof(prefix) - 1;
+
+	t = btf_type_by_id(btf_vmlinux, t->type);
+	if (!btf_type_is_ptr(t))
+		return false;
+	t = btf_type_by_id(btf_vmlinux, t->type);
+	if (!btf_type_is_func_proto(t))
+		return false;
+
+	if (off % 8) {
+		bpf_log(log, "raw_tp '%s' offset %d is not multiple of 8\n",
+			tname, off);
+		return false;
+	}
+	arg = off / 8;
+	args = (const struct btf_param *)(t + 1);
+	/* skip first 'void *__data' argument in btf_trace_##name typedef */
+	args++;
+	nr_args = btf_type_vlen(t) - 1;
+	if (arg >= nr_args) {
+		bpf_log(log, "raw_tp '%s' doesn't have %d-th argument\n",
+			tname, arg);
+		return false;
+	}
+
+	t = btf_type_by_id(btf_vmlinux, args[arg].type);
+	/* skip modifiers */
+	while (btf_type_is_modifier(t))
+		t = btf_type_by_id(btf_vmlinux, t->type);
+	if (btf_type_is_int(t))
+		/* accessing a scalar */
+		return true;
+	if (!btf_type_is_ptr(t)) {
+		bpf_log(log,
+			"raw_tp '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
+			tname, arg,
+			__btf_name_by_offset(btf_vmlinux, t->name_off),
+			btf_kind_str[BTF_INFO_KIND(t->info)]);
+		return false;
+	}
+	if (t->type == 0)
+		/* This is a pointer to void.
+		 * It is the same as scalar from the verifier safety pov.
+		 * No further pointer walking is allowed.
+		 */
+		return true;
+
+	/* this is a pointer to another type */
+	info->reg_type = PTR_TO_BTF_ID;
+	info->btf_id = t->type;
+
+	t = btf_type_by_id(btf_vmlinux, t->type);
+	/* skip modifiers */
+	while (btf_type_is_modifier(t))
+		t = btf_type_by_id(btf_vmlinux, t->type);
+	if (!btf_type_is_struct(t)) {
+		bpf_log(log,
+			"raw_tp '%s' arg%d type %s is not a struct\n",
+			tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]);
+		return false;
+	}
+	bpf_log(log, "raw_tp '%s' arg%d has btf_id %d type %s '%s'\n",
+		tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)],
+		__btf_name_by_offset(btf_vmlinux, t->name_off));
+	return true;
+}
+
+int btf_struct_access(struct bpf_verifier_log *log,
+		      const struct btf_type *t, int off, int size,
+		      enum bpf_access_type atype,
+		      u32 *next_btf_id)
+{
+	const struct btf_member *member;
+	const struct btf_type *mtype;
+	const char *tname, *mname;
+	int i, moff = 0, msize;
+
+again:
+	tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
+	if (!btf_type_is_struct(t)) {
+		bpf_log(log, "Type '%s' is not a struct", tname);
+		return -EINVAL;
+	}
+
+	for_each_member(i, t, member) {
+		/* offset of the field in bits */
+		moff = btf_member_bit_offset(t, member);
+
+		if (btf_member_bitfield_size(t, member))
+			/* bitfields are not supported yet */
+			continue;
+
+		if (off + size <= moff / 8)
+			/* won't find anything, field is already too far */
+			break;
+
+		/* type of the field */
+		mtype = btf_type_by_id(btf_vmlinux, member->type);
+		mname = __btf_name_by_offset(btf_vmlinux, member->name_off);
+
+		/* skip modifiers */
+		while (btf_type_is_modifier(mtype))
+			mtype = btf_type_by_id(btf_vmlinux, mtype->type);
+
+		if (btf_type_is_array(mtype))
+			/* array deref is not supported yet */
+			continue;
+
+		if (!btf_type_has_size(mtype) && !btf_type_is_ptr(mtype)) {
+			bpf_log(log, "field %s doesn't have size\n", mname);
+			return -EFAULT;
+		}
+		if (btf_type_is_ptr(mtype))
+			msize = 8;
+		else
+			msize = mtype->size;
+		if (off >= moff / 8 + msize)
+			/* no overlap with member, keep iterating */
+			continue;
+		/* the 'off' we're looking for is either equal to start
+		 * of this field or inside of this struct
+		 */
+		if (btf_type_is_struct(mtype)) {
+			/* our field must be inside that union or struct */
+			t = mtype;
+
+			/* adjust offset we're looking for */
+			off -= moff / 8;
+			goto again;
+		}
+		if (msize != size) {
+			/* field access size doesn't match */
+			bpf_log(log,
+				"cannot access %d bytes in struct %s field %s that has size %d\n",
+				size, tname, mname, msize);
+			return -EACCES;
+		}
+
+		if (btf_type_is_ptr(mtype)) {
+			const struct btf_type *stype;
+
+			stype = btf_type_by_id(btf_vmlinux, mtype->type);
+			/* skip modifiers */
+			while (btf_type_is_modifier(stype))
+				stype = btf_type_by_id(btf_vmlinux, stype->type);
+			if (btf_type_is_struct(stype)) {
+				*next_btf_id = mtype->type;
+				return PTR_TO_BTF_ID;
+			}
+		}
+		/* all other fields are treated as scalars */
+		return SCALAR_VALUE;
+	}
+	bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off);
+	return -EINVAL;
+}
+
 void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 466b3b19de4d..42a463e09761 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -286,6 +286,19 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
 	va_end(args);
 }
 
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
+			    const char *fmt, ...)
+{
+	va_list args;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	va_start(args, fmt);
+	bpf_verifier_vlog(log, fmt, args);
+	va_end(args);
+}
+
 static const char *ltrim(const char *s)
 {
 	while (isspace(*s))
@@ -406,6 +419,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
 	[PTR_TO_TP_BUFFER]	= "tp_buffer",
 	[PTR_TO_XDP_SOCK]	= "xdp_sock",
+	[PTR_TO_BTF_ID]		= "ptr_",
 };
 
 static char slot_type_char[] = {
@@ -436,6 +450,12 @@ static struct bpf_func_state *func(struct bpf_verifier_env *env,
 	return cur->frame[reg->frameno];
 }
 
+const char *kernel_type_name(u32 id)
+{
+	return btf_name_by_offset(btf_vmlinux,
+				  btf_type_by_id(btf_vmlinux, id)->name_off);
+}
+
 static void print_verifier_state(struct bpf_verifier_env *env,
 				 const struct bpf_func_state *state)
 {
@@ -460,6 +480,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			/* reg->off should be 0 for SCALAR_VALUE */
 			verbose(env, "%lld", reg->var_off.value + reg->off);
 		} else {
+			if (t == PTR_TO_BTF_ID)
+				verbose(env, "%s", kernel_type_name(reg->btf_id));
 			verbose(env, "(id=%d", reg->id);
 			if (reg_type_may_be_refcounted_or_null(t))
 				verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
@@ -2337,10 +2359,12 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 
 /* check access to 'struct bpf_context' fields.  Supports fixed offsets only */
 static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
-			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
+			    enum bpf_access_type t, enum bpf_reg_type *reg_type,
+			    u32 *btf_id)
 {
 	struct bpf_insn_access_aux info = {
 		.reg_type = *reg_type,
+		.log = &env->log,
 	};
 
 	if (env->ops->is_valid_access &&
@@ -2354,7 +2378,10 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		 */
 		*reg_type = info.reg_type;
 
-		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
+		if (*reg_type == PTR_TO_BTF_ID)
+			*btf_id = info.btf_id;
+		else
+			env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
 		/* remember the offset of last byte accessed in ctx */
 		if (env->prog->aux->max_ctx_offset < off + size)
 			env->prog->aux->max_ctx_offset = off + size;
@@ -2780,6 +2807,53 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
 	return 0;
 }
 
+static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
+				   struct bpf_reg_state *regs,
+				   int regno, int off, int size,
+				   enum bpf_access_type atype,
+				   int value_regno)
+{
+	struct bpf_reg_state *reg = regs + regno;
+	const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id);
+	const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+	u32 btf_id;
+	int ret;
+
+	if (atype != BPF_READ) {
+		verbose(env, "only read is supported\n");
+		return -EACCES;
+	}
+
+	if (off < 0) {
+		verbose(env,
+			"R%d is ptr_%s invalid negative access: off=%d\n",
+			regno, tname, off);
+		return -EACCES;
+	}
+	if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+		char tn_buf[48];
+
+		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+		verbose(env,
+			"R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n",
+			regno, tname, off, tn_buf);
+		return -EACCES;
+	}
+
+	ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id);
+	if (ret < 0)
+		return ret;
+
+	if (ret == SCALAR_VALUE) {
+		mark_reg_unknown(env, regs, value_regno);
+		return 0;
+	}
+	mark_reg_known_zero(env, regs, value_regno);
+	regs[value_regno].type = PTR_TO_BTF_ID;
+	regs[value_regno].btf_id = btf_id;
+	return 0;
+}
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
@@ -2840,6 +2914,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		}
 	} else if (reg->type == PTR_TO_CTX) {
 		enum bpf_reg_type reg_type = SCALAR_VALUE;
+		u32 btf_id = 0;
 
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
@@ -2851,7 +2926,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		if (err < 0)
 			return err;
 
-		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
+		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf_id);
+		if (err)
+			verbose_linfo(env, insn_idx, "; ");
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
 			 * PTR_TO_PACKET[_META,_END]. In the latter
@@ -2870,6 +2947,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				 * a sub-register.
 				 */
 				regs[value_regno].subreg_def = DEF_NOT_SUBREG;
+				if (reg_type == PTR_TO_BTF_ID)
+					regs[value_regno].btf_id = btf_id;
 			}
 			regs[value_regno].type = reg_type;
 		}
@@ -2929,6 +3008,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_tp_buffer_access(env, reg, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
+	} else if (reg->type == PTR_TO_BTF_ID) {
+		err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
+					      value_regno);
 	} else {
 		verbose(env, "R%d invalid mem access '%s'\n", regno,
 			reg_type_str[reg->type]);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 44bd08f2443b..6221e8c6ecc3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1074,7 +1074,7 @@ static bool raw_tp_prog_is_valid_access(int off, int size,
 		return false;
 	if (off % size != 0)
 		return false;
-	return true;
+	return btf_ctx_access(off, size, type, prog, info);
 }
 
 const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
-- 
cgit v1.2.3-59-g8ed1b


From 2a02759ef5f8a34792df22b41d5e10658fd7bbd3 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 15 Oct 2019 20:25:02 -0700
Subject: bpf: Add support for BTF pointers to interpreter

Pointer to BTF object is a pointer to kernel object or NULL.
The memory access in the interpreter has to be done via probe_kernel_read
to avoid page faults.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20191016032505.2089704-9-ast@kernel.org
---
 include/linux/filter.h |  3 +++
 kernel/bpf/core.c      | 19 +++++++++++++++++++
 kernel/bpf/verifier.c  |  8 ++++++++
 3 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index d3d51d7aff2c..22ebea2e64ea 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -65,6 +65,9 @@ struct ctl_table_header;
 /* unused opcode to mark special call to bpf_tail_call() helper */
 #define BPF_TAIL_CALL	0xf0
 
+/* unused opcode to mark special load instruction. Same as BPF_ABS */
+#define BPF_PROBE_MEM	0x20
+
 /* unused opcode to mark call to interpreter with arguments */
 #define BPF_CALL_ARGS	0xe0
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 66088a9e9b9e..8a765bbd33f0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1291,6 +1291,11 @@ bool bpf_opcode_in_insntable(u8 code)
 }
 
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
+u64 __weak bpf_probe_read(void * dst, u32 size, const void * unsafe_ptr)
+{
+	memset(dst, 0, size);
+	return -EFAULT;
+}
 /**
  *	__bpf_prog_run - run eBPF program on a given context
  *	@regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
@@ -1310,6 +1315,10 @@ static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u6
 		/* Non-UAPI available opcodes. */
 		[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
 		[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
+		[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
+		[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
+		[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
+		[BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
 	};
 #undef BPF_INSN_3_LBL
 #undef BPF_INSN_2_LBL
@@ -1542,6 +1551,16 @@ out:
 	LDST(W,  u32)
 	LDST(DW, u64)
 #undef LDST
+#define LDX_PROBE(SIZEOP, SIZE)						\
+	LDX_PROBE_MEM_##SIZEOP:						\
+		bpf_probe_read(&DST, SIZE, (const void *)(long) SRC);	\
+		CONT;
+	LDX_PROBE(B,  1)
+	LDX_PROBE(H,  2)
+	LDX_PROBE(W,  4)
+	LDX_PROBE(DW, 8)
+#undef LDX_PROBE
+
 	STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
 		atomic_add((u32) SRC, (atomic_t *)(unsigned long)
 			   (DST + insn->off));
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 42a463e09761..c4b6a2cfcd47 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7581,6 +7581,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
+	case PTR_TO_BTF_ID:
 		return false;
 	default:
 		return true;
@@ -8722,6 +8723,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		case PTR_TO_XDP_SOCK:
 			convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
 			break;
+		case PTR_TO_BTF_ID:
+			if (type == BPF_WRITE) {
+				verbose(env, "Writes through BTF pointers are not allowed\n");
+				return -EINVAL;
+			}
+			insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code);
+			continue;
 		default:
 			continue;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 3dec541b2e632d630fe7142ed44f0b3702ef1f8c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 15 Oct 2019 20:25:03 -0700
Subject: bpf: Add support for BTF pointers to x86 JIT

Pointer to BTF object is a pointer to kernel object or NULL.
Such pointers can only be used by BPF_LDX instructions.
The verifier changed their opcode from LDX|MEM|size
to LDX|PROBE_MEM|size to make JITing easier.
The number of entries in extable is the number of BPF_LDX insns
that access kernel memory via "pointer to BTF type".
Only these load instructions can fault.
Since x86 extable is relative it has to be allocated in the same
memory region as JITed code.
Allocate it prior to last pass of JITing and let the last pass populate it.
Pointer to extable in bpf_prog_aux is necessary to make page fault
handling fast.
Page fault handling is done in two steps:
1. bpf_prog_kallsyms_find() finds BPF program that page faulted.
   It's done by walking rb tree.
2. then extable for given bpf program is binary searched.
This process is similar to how page faulting is done for kernel modules.
The exception handler skips over faulting x86 instruction and
initializes destination register with zero. This mimics exact
behavior of bpf_probe_read (when probe_kernel_read faults dest is zeroed).

JITs for other architectures can add support in similar way.
Until then they will reject unknown opcode and fallback to interpreter.

Since extable should be aligned and placed near JITed code
make bpf_jit_binary_alloc() return 4 byte aligned image offset,
so that extable aligning formula in bpf_int_jit_compile() doesn't need
to rely on internal implementation of bpf_jit_binary_alloc().
On x86 gcc defaults to 16-byte alignment for regular kernel functions
due to better performance. JITed code may be aligned to 16 in the future,
but it will use 4 in the meantime.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20191016032505.2089704-10-ast@kernel.org
---
 arch/x86/net/bpf_jit_comp.c | 97 +++++++++++++++++++++++++++++++++++++++++++--
 include/linux/bpf.h         |  3 ++
 include/linux/extable.h     | 10 +++++
 kernel/bpf/core.c           | 20 +++++++++-
 kernel/bpf/verifier.c       |  1 +
 kernel/extable.c            |  2 +
 6 files changed, 128 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 3ad2ba1ad855..8cd23d8309bf 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -9,7 +9,7 @@
 #include <linux/filter.h>
 #include <linux/if_vlan.h>
 #include <linux/bpf.h>
-
+#include <asm/extable.h>
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
 
@@ -123,6 +123,19 @@ static const int reg2hex[] = {
 	[AUX_REG] = 3,    /* R11 temp register */
 };
 
+static const int reg2pt_regs[] = {
+	[BPF_REG_0] = offsetof(struct pt_regs, ax),
+	[BPF_REG_1] = offsetof(struct pt_regs, di),
+	[BPF_REG_2] = offsetof(struct pt_regs, si),
+	[BPF_REG_3] = offsetof(struct pt_regs, dx),
+	[BPF_REG_4] = offsetof(struct pt_regs, cx),
+	[BPF_REG_5] = offsetof(struct pt_regs, r8),
+	[BPF_REG_6] = offsetof(struct pt_regs, bx),
+	[BPF_REG_7] = offsetof(struct pt_regs, r13),
+	[BPF_REG_8] = offsetof(struct pt_regs, r14),
+	[BPF_REG_9] = offsetof(struct pt_regs, r15),
+};
+
 /*
  * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15
  * which need extra byte of encoding.
@@ -377,6 +390,19 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
 	*pprog = prog;
 }
 
+
+static bool ex_handler_bpf(const struct exception_table_entry *x,
+			   struct pt_regs *regs, int trapnr,
+			   unsigned long error_code, unsigned long fault_addr)
+{
+	u32 reg = x->fixup >> 8;
+
+	/* jump over faulting load and clear dest register */
+	*(unsigned long *)((void *)regs + reg) = 0;
+	regs->ip += x->fixup & 0xff;
+	return true;
+}
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		  int oldproglen, struct jit_context *ctx)
 {
@@ -384,7 +410,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 	int insn_cnt = bpf_prog->len;
 	bool seen_exit = false;
 	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
-	int i, cnt = 0;
+	int i, cnt = 0, excnt = 0;
 	int proglen = 0;
 	u8 *prog = temp;
 
@@ -778,14 +804,17 @@ stx:			if (is_imm8(insn->off))
 
 			/* LDX: dst_reg = *(u8*)(src_reg + off) */
 		case BPF_LDX | BPF_MEM | BPF_B:
+		case BPF_LDX | BPF_PROBE_MEM | BPF_B:
 			/* Emit 'movzx rax, byte ptr [rax + off]' */
 			EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_H:
+		case BPF_LDX | BPF_PROBE_MEM | BPF_H:
 			/* Emit 'movzx rax, word ptr [rax + off]' */
 			EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_W:
+		case BPF_LDX | BPF_PROBE_MEM | BPF_W:
 			/* Emit 'mov eax, dword ptr [rax+0x14]' */
 			if (is_ereg(dst_reg) || is_ereg(src_reg))
 				EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
@@ -793,6 +822,7 @@ stx:			if (is_imm8(insn->off))
 				EMIT1(0x8B);
 			goto ldx;
 		case BPF_LDX | BPF_MEM | BPF_DW:
+		case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
 			/* Emit 'mov rax, qword ptr [rax+0x14]' */
 			EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
 ldx:			/*
@@ -805,6 +835,48 @@ ldx:			/*
 			else
 				EMIT1_off32(add_2reg(0x80, src_reg, dst_reg),
 					    insn->off);
+			if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
+				struct exception_table_entry *ex;
+				u8 *_insn = image + proglen;
+				s64 delta;
+
+				if (!bpf_prog->aux->extable)
+					break;
+
+				if (excnt >= bpf_prog->aux->num_exentries) {
+					pr_err("ex gen bug\n");
+					return -EFAULT;
+				}
+				ex = &bpf_prog->aux->extable[excnt++];
+
+				delta = _insn - (u8 *)&ex->insn;
+				if (!is_simm32(delta)) {
+					pr_err("extable->insn doesn't fit into 32-bit\n");
+					return -EFAULT;
+				}
+				ex->insn = delta;
+
+				delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler;
+				if (!is_simm32(delta)) {
+					pr_err("extable->handler doesn't fit into 32-bit\n");
+					return -EFAULT;
+				}
+				ex->handler = delta;
+
+				if (dst_reg > BPF_REG_9) {
+					pr_err("verifier error\n");
+					return -EFAULT;
+				}
+				/*
+				 * Compute size of x86 insn and its target dest x86 register.
+				 * ex_handler_bpf() will use lower 8 bits to adjust
+				 * pt_regs->ip to jump over this x86 instruction
+				 * and upper bits to figure out which pt_regs to zero out.
+				 * End result: x86 insn "mov rbx, qword ptr [rax+0x14]"
+				 * of 4 bytes will be ignored and rbx will be zero inited.
+				 */
+				ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8);
+			}
 			break;
 
 			/* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
@@ -1058,6 +1130,11 @@ emit_jmp:
 		addrs[i] = proglen;
 		prog = temp;
 	}
+
+	if (image && excnt != bpf_prog->aux->num_exentries) {
+		pr_err("extable is not populated\n");
+		return -EFAULT;
+	}
 	return proglen;
 }
 
@@ -1158,12 +1235,24 @@ out_image:
 			break;
 		}
 		if (proglen == oldproglen) {
-			header = bpf_jit_binary_alloc(proglen, &image,
-						      1, jit_fill_hole);
+			/*
+			 * The number of entries in extable is the number of BPF_LDX
+			 * insns that access kernel memory via "pointer to BTF type".
+			 * The verifier changed their opcode from LDX|MEM|size
+			 * to LDX|PROBE_MEM|size to make JITing easier.
+			 */
+			u32 align = __alignof__(struct exception_table_entry);
+			u32 extable_size = prog->aux->num_exentries *
+				sizeof(struct exception_table_entry);
+
+			/* allocate module memory for x86 insns and extable */
+			header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
+						      &image, align, jit_fill_hole);
 			if (!header) {
 				prog = orig_prog;
 				goto out_addrs;
 			}
+			prog->aux->extable = (void *) image + roundup(proglen, align);
 		}
 		oldproglen = proglen;
 		cond_resched();
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 028555fcd10d..a7330d75bb94 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -24,6 +24,7 @@ struct sock;
 struct seq_file;
 struct btf;
 struct btf_type;
+struct exception_table_entry;
 
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
@@ -423,6 +424,8 @@ struct bpf_prog_aux {
 	 * main prog always has linfo_idx == 0
 	 */
 	u32 linfo_idx;
+	u32 num_exentries;
+	struct exception_table_entry *extable;
 	struct bpf_prog_stats __percpu *stats;
 	union {
 		struct work_struct work;
diff --git a/include/linux/extable.h b/include/linux/extable.h
index 81ecfaa83ad3..4ab9e78f313b 100644
--- a/include/linux/extable.h
+++ b/include/linux/extable.h
@@ -33,4 +33,14 @@ search_module_extables(unsigned long addr)
 }
 #endif /*CONFIG_MODULES*/
 
+#ifdef CONFIG_BPF_JIT
+const struct exception_table_entry *search_bpf_extables(unsigned long addr);
+#else
+static inline const struct exception_table_entry *
+search_bpf_extables(unsigned long addr)
+{
+	return NULL;
+}
+#endif
+
 #endif /* _LINUX_EXTABLE_H */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8a765bbd33f0..673f5d40a93e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -30,7 +30,7 @@
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
 #include <linux/perf_event.h>
-
+#include <linux/extable.h>
 #include <asm/unaligned.h>
 
 /* Registers */
@@ -712,6 +712,24 @@ bool is_bpf_text_address(unsigned long addr)
 	return ret;
 }
 
+const struct exception_table_entry *search_bpf_extables(unsigned long addr)
+{
+	const struct exception_table_entry *e = NULL;
+	struct bpf_prog *prog;
+
+	rcu_read_lock();
+	prog = bpf_prog_kallsyms_find(addr);
+	if (!prog)
+		goto out;
+	if (!prog->aux->num_exentries)
+		goto out;
+
+	e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr);
+out:
+	rcu_read_unlock();
+	return e;
+}
+
 int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 		    char *sym)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c4b6a2cfcd47..fba9ef6a831b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8729,6 +8729,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 				return -EINVAL;
 			}
 			insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code);
+			env->prog->aux->num_exentries++;
 			continue;
 		default:
 			continue;
diff --git a/kernel/extable.c b/kernel/extable.c
index f6c9406eec7d..f6920a11e28a 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -56,6 +56,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 	e = search_kernel_exception_table(addr);
 	if (!e)
 		e = search_module_extables(addr);
+	if (!e)
+		e = search_bpf_extables(addr);
 	return e;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From a7658e1a4164ce2b9eb4a11aadbba38586e93bd6 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 15 Oct 2019 20:25:04 -0700
Subject: bpf: Check types of arguments passed into helpers

Introduce new helper that reuses existing skb perf_event output
implementation, but can be called from raw_tracepoint programs
that receive 'struct sk_buff *' as tracepoint argument or
can walk other kernel data structures to skb pointer.

In order to do that teach verifier to resolve true C types
of bpf helpers into in-kernel BTF ids.
The type of kernel pointer passed by raw tracepoint into bpf
program will be tracked by the verifier all the way until
it's passed into helper function.
For example:
kfree_skb() kernel function calls trace_kfree_skb(skb, loc);
bpf programs receives that skb pointer and may eventually
pass it into bpf_skb_output() bpf helper which in-kernel is
implemented via bpf_skb_event_output() kernel function.
Its first argument in the kernel is 'struct sk_buff *'.
The verifier makes sure that types match all the way.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20191016032505.2089704-11-ast@kernel.org
---
 include/linux/bpf.h            | 18 +++++++----
 include/uapi/linux/bpf.h       | 27 ++++++++++++++++-
 kernel/bpf/btf.c               | 68 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c          | 44 +++++++++++++++++----------
 kernel/trace/bpf_trace.c       |  4 +++
 net/core/filter.c              | 15 +++++++++-
 tools/include/uapi/linux/bpf.h | 27 ++++++++++++++++-
 7 files changed, 180 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a7330d75bb94..2c2c29b49845 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -213,6 +213,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_INT,		/* pointer to int */
 	ARG_PTR_TO_LONG,	/* pointer to long */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock (fullsock) */
+	ARG_PTR_TO_BTF_ID,	/* pointer to in-kernel struct */
 };
 
 /* type of values returned from helper functions */
@@ -235,11 +236,17 @@ struct bpf_func_proto {
 	bool gpl_only;
 	bool pkt_access;
 	enum bpf_return_type ret_type;
-	enum bpf_arg_type arg1_type;
-	enum bpf_arg_type arg2_type;
-	enum bpf_arg_type arg3_type;
-	enum bpf_arg_type arg4_type;
-	enum bpf_arg_type arg5_type;
+	union {
+		struct {
+			enum bpf_arg_type arg1_type;
+			enum bpf_arg_type arg2_type;
+			enum bpf_arg_type arg3_type;
+			enum bpf_arg_type arg4_type;
+			enum bpf_arg_type arg5_type;
+		};
+		enum bpf_arg_type arg_type[5];
+	};
+	u32 *btf_id; /* BTF ids of arguments */
 };
 
 /* bpf_context is intentionally undefined structure. Pointer to bpf_context is
@@ -765,6 +772,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
 		      const struct btf_type *t, int off, int size,
 		      enum bpf_access_type atype,
 		      u32 *next_btf_id);
+u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int);
 
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3bb2cd1de341..4af8b0819a32 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2751,6 +2751,30 @@ union bpf_attr {
  *		**-EOPNOTSUPP** kernel configuration does not enable SYN cookies
  *
  *		**-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * 	Description
+ * 		Write raw *data* blob into a special BPF perf event held by
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * 		event must have the following attributes: **PERF_SAMPLE_RAW**
+ * 		as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 		**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 		The *flags* are used to indicate the index in *map* for which
+ * 		the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 		Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 		to indicate that the index of the current CPU core should be
+ * 		used.
+ *
+ * 		The value to write, of *size*, is passed through eBPF stack and
+ * 		pointed by *data*.
+ *
+ * 		*ctx* is a pointer to in-kernel struct sk_buff.
+ *
+ * 		This helper is similar to **bpf_perf_event_output**\ () but
+ * 		restricted to raw_tracepoint bpf programs.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2863,7 +2887,8 @@ union bpf_attr {
 	FN(sk_storage_get),		\
 	FN(sk_storage_delete),		\
 	FN(send_signal),		\
-	FN(tcp_gen_syncookie),
+	FN(tcp_gen_syncookie),		\
+	FN(skb_output),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 271d27cd427f..f7557af39756 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3626,6 +3626,74 @@ again:
 	return -EINVAL;
 }
 
+u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg)
+{
+	char fnname[KSYM_SYMBOL_LEN + 4] = "btf_";
+	const struct btf_param *args;
+	const struct btf_type *t;
+	const char *tname, *sym;
+	u32 btf_id, i;
+
+	if (IS_ERR(btf_vmlinux)) {
+		bpf_log(log, "btf_vmlinux is malformed\n");
+		return -EINVAL;
+	}
+
+	sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4);
+	if (!sym) {
+		bpf_log(log, "kernel doesn't have kallsyms\n");
+		return -EFAULT;
+	}
+
+	for (i = 1; i <= btf_vmlinux->nr_types; i++) {
+		t = btf_type_by_id(btf_vmlinux, i);
+		if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF)
+			continue;
+		tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
+		if (!strcmp(tname, fnname))
+			break;
+	}
+	if (i > btf_vmlinux->nr_types) {
+		bpf_log(log, "helper %s type is not found\n", fnname);
+		return -ENOENT;
+	}
+
+	t = btf_type_by_id(btf_vmlinux, t->type);
+	if (!btf_type_is_ptr(t))
+		return -EFAULT;
+	t = btf_type_by_id(btf_vmlinux, t->type);
+	if (!btf_type_is_func_proto(t))
+		return -EFAULT;
+
+	args = (const struct btf_param *)(t + 1);
+	if (arg >= btf_type_vlen(t)) {
+		bpf_log(log, "bpf helper %s doesn't have %d-th argument\n",
+			fnname, arg);
+		return -EINVAL;
+	}
+
+	t = btf_type_by_id(btf_vmlinux, args[arg].type);
+	if (!btf_type_is_ptr(t) || !t->type) {
+		/* anything but the pointer to struct is a helper config bug */
+		bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n");
+		return -EFAULT;
+	}
+	btf_id = t->type;
+	t = btf_type_by_id(btf_vmlinux, t->type);
+	/* skip modifiers */
+	while (btf_type_is_modifier(t)) {
+		btf_id = t->type;
+		t = btf_type_by_id(btf_vmlinux, t->type);
+	}
+	if (!btf_type_is_struct(t)) {
+		bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n");
+		return -EFAULT;
+	}
+	bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4,
+		arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off));
+	return btf_id;
+}
+
 void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fba9ef6a831b..556e82f8869b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -205,6 +205,7 @@ struct bpf_call_arg_meta {
 	u64 msize_umax_value;
 	int ref_obj_id;
 	int func_id;
+	u32 btf_id;
 };
 
 struct btf *btf_vmlinux;
@@ -3439,6 +3440,22 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_SOCKET;
 		if (type != expected_type)
 			goto err_type;
+	} else if (arg_type == ARG_PTR_TO_BTF_ID) {
+		expected_type = PTR_TO_BTF_ID;
+		if (type != expected_type)
+			goto err_type;
+		if (reg->btf_id != meta->btf_id) {
+			verbose(env, "Helper has type %s got %s in R%d\n",
+				kernel_type_name(meta->btf_id),
+				kernel_type_name(reg->btf_id), regno);
+
+			return -EACCES;
+		}
+		if (!tnum_is_const(reg->var_off) || reg->var_off.value || reg->off) {
+			verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n",
+				regno);
+			return -EACCES;
+		}
 	} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
 		if (meta->func_id == BPF_FUNC_spin_lock) {
 			if (process_spin_lock(env, regno, true))
@@ -3586,6 +3603,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
 		if (func_id != BPF_FUNC_perf_event_read &&
 		    func_id != BPF_FUNC_perf_event_output &&
+		    func_id != BPF_FUNC_skb_output &&
 		    func_id != BPF_FUNC_perf_event_read_value)
 			goto error;
 		break;
@@ -3673,6 +3691,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	case BPF_FUNC_perf_event_read:
 	case BPF_FUNC_perf_event_output:
 	case BPF_FUNC_perf_event_read_value:
+	case BPF_FUNC_skb_output:
 		if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
 			goto error;
 		break;
@@ -4127,21 +4146,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 
 	meta.func_id = func_id;
 	/* check args */
-	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
-	if (err)
-		return err;
-	err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
-	if (err)
-		return err;
-	err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
-	if (err)
-		return err;
-	err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta);
-	if (err)
-		return err;
-	err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta);
-	if (err)
-		return err;
+	for (i = 0; i < 5; i++) {
+		if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID) {
+			if (!fn->btf_id[i])
+				fn->btf_id[i] = btf_resolve_helper_id(&env->log, fn->func, i);
+			meta.btf_id = fn->btf_id[i];
+		}
+		err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta);
+		if (err)
+			return err;
+	}
 
 	err = record_func_map(env, &meta, func_id, insn_idx);
 	if (err)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 6221e8c6ecc3..52f7e9d8c29b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -995,6 +995,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
+extern const struct bpf_func_proto bpf_skb_output_proto;
+
 BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
 	   struct bpf_map *, map, u64, flags)
 {
@@ -1053,6 +1055,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
 		return &bpf_perf_event_output_proto_raw_tp;
+	case BPF_FUNC_skb_output:
+		return &bpf_skb_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_raw_tp;
 	case BPF_FUNC_get_stack:
diff --git a/net/core/filter.c b/net/core/filter.c
index 46196e212413..728ba6203c1f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3798,7 +3798,7 @@ BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
 
 	if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
 		return -EINVAL;
-	if (unlikely(skb_size > skb->len))
+	if (unlikely(!skb || skb_size > skb->len))
 		return -EFAULT;
 
 	return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
@@ -3816,6 +3816,19 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
+static u32 bpf_skb_output_btf_ids[5];
+const struct bpf_func_proto bpf_skb_output_proto = {
+	.func		= bpf_skb_event_output,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
+	.btf_id		= bpf_skb_output_btf_ids,
+};
+
 static unsigned short bpf_tunnel_key_af(u64 flags)
 {
 	return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3bb2cd1de341..4af8b0819a32 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2751,6 +2751,30 @@ union bpf_attr {
  *		**-EOPNOTSUPP** kernel configuration does not enable SYN cookies
  *
  *		**-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * 	Description
+ * 		Write raw *data* blob into a special BPF perf event held by
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * 		event must have the following attributes: **PERF_SAMPLE_RAW**
+ * 		as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 		**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 		The *flags* are used to indicate the index in *map* for which
+ * 		the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 		Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 		to indicate that the index of the current CPU core should be
+ * 		used.
+ *
+ * 		The value to write, of *size*, is passed through eBPF stack and
+ * 		pointed by *data*.
+ *
+ * 		*ctx* is a pointer to in-kernel struct sk_buff.
+ *
+ * 		This helper is similar to **bpf_perf_event_output**\ () but
+ * 		restricted to raw_tracepoint bpf programs.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2863,7 +2887,8 @@ union bpf_attr {
 	FN(sk_storage_get),		\
 	FN(sk_storage_delete),		\
 	FN(send_signal),		\
-	FN(tcp_gen_syncookie),
+	FN(tcp_gen_syncookie),		\
+	FN(skb_output),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit v1.2.3-59-g8ed1b


From 68bb8ea8ad0d497c28ed47423246b1ab20f26976 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Mon, 21 Oct 2019 16:51:15 -0400
Subject: net: dsa: use dsa_to_port helper everywhere

Do not let the drivers access the ds->ports static array directly
while there is a dsa_to_port helper for this purpose.

At the same time, un-const this helper since the SJA1105 driver
assigns the priv member of the returned dsa_port structure.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/dsa/b53/b53_common.c       |  6 +++---
 drivers/net/dsa/bcm_sf2.c              |  8 ++++----
 drivers/net/dsa/bcm_sf2_cfp.c          |  6 +++---
 drivers/net/dsa/mt7530.c               | 12 ++++++------
 drivers/net/dsa/mv88e6xxx/chip.c       | 10 +++++-----
 drivers/net/dsa/qca8k.c                |  2 +-
 drivers/net/dsa/sja1105/sja1105_main.c | 18 +++++++++---------
 include/net/dsa.h                      |  2 +-
 net/dsa/dsa.c                          |  8 +++++---
 net/dsa/dsa2.c                         |  4 ++--
 net/dsa/switch.c                       |  4 ++--
 net/dsa/tag_8021q.c                    |  6 +++---
 12 files changed, 44 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index cc3536315eff..aef9b56781ef 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -524,7 +524,7 @@ int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy)
 	if (!dsa_is_user_port(ds, port))
 		return 0;
 
-	cpu_port = ds->ports[port].cpu_dp->index;
+	cpu_port = dsa_to_port(ds, port)->cpu_dp->index;
 
 	if (dev->ops->irq_enable)
 		ret = dev->ops->irq_enable(dev, port);
@@ -1629,7 +1629,7 @@ EXPORT_SYMBOL(b53_fdb_dump);
 int b53_br_join(struct dsa_switch *ds, int port, struct net_device *br)
 {
 	struct b53_device *dev = ds->priv;
-	s8 cpu_port = ds->ports[port].cpu_dp->index;
+	s8 cpu_port = dsa_to_port(ds, port)->cpu_dp->index;
 	u16 pvlan, reg;
 	unsigned int i;
 
@@ -1675,7 +1675,7 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *br)
 {
 	struct b53_device *dev = ds->priv;
 	struct b53_vlan *vl = &dev->vlans[0];
-	s8 cpu_port = ds->ports[port].cpu_dp->index;
+	s8 cpu_port = dsa_to_port(ds, port)->cpu_dp->index;
 	unsigned int i;
 	u16 pvlan, reg, pvid;
 
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 26509fa37a50..c068a3b7207b 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -662,7 +662,7 @@ static void bcm_sf2_sw_fixed_state(struct dsa_switch *ds, int port,
 		 * state machine and make it go in PHY_FORCING state instead.
 		 */
 		if (!status->link)
-			netif_carrier_off(ds->ports[port].slave);
+			netif_carrier_off(dsa_to_port(ds, port)->slave);
 		status->duplex = DUPLEX_FULL;
 	} else {
 		status->link = true;
@@ -728,7 +728,7 @@ static int bcm_sf2_sw_resume(struct dsa_switch *ds)
 static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port,
 			       struct ethtool_wolinfo *wol)
 {
-	struct net_device *p = ds->ports[port].cpu_dp->master;
+	struct net_device *p = dsa_to_port(ds, port)->cpu_dp->master;
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	struct ethtool_wolinfo pwol = { };
 
@@ -752,9 +752,9 @@ static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port,
 static int bcm_sf2_sw_set_wol(struct dsa_switch *ds, int port,
 			      struct ethtool_wolinfo *wol)
 {
-	struct net_device *p = ds->ports[port].cpu_dp->master;
+	struct net_device *p = dsa_to_port(ds, port)->cpu_dp->master;
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-	s8 cpu_port = ds->ports[port].cpu_dp->index;
+	s8 cpu_port = dsa_to_port(ds, port)->cpu_dp->index;
 	struct ethtool_wolinfo pwol =  { };
 
 	if (p->ethtool_ops->get_wol)
diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index d264776a95a3..f3f0c3f07391 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -821,7 +821,7 @@ static int bcm_sf2_cfp_rule_insert(struct dsa_switch *ds, int port,
 				   struct ethtool_rx_flow_spec *fs)
 {
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
-	s8 cpu_port = ds->ports[port].cpu_dp->index;
+	s8 cpu_port = dsa_to_port(ds, port)->cpu_dp->index;
 	__u64 ring_cookie = fs->ring_cookie;
 	unsigned int queue_num, port_num;
 	int ret;
@@ -1049,7 +1049,7 @@ static int bcm_sf2_cfp_rule_get_all(struct bcm_sf2_priv *priv,
 int bcm_sf2_get_rxnfc(struct dsa_switch *ds, int port,
 		      struct ethtool_rxnfc *nfc, u32 *rule_locs)
 {
-	struct net_device *p = ds->ports[port].cpu_dp->master;
+	struct net_device *p = dsa_to_port(ds, port)->cpu_dp->master;
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	int ret = 0;
 
@@ -1092,7 +1092,7 @@ int bcm_sf2_get_rxnfc(struct dsa_switch *ds, int port,
 int bcm_sf2_set_rxnfc(struct dsa_switch *ds, int port,
 		      struct ethtool_rxnfc *nfc)
 {
-	struct net_device *p = ds->ports[port].cpu_dp->master;
+	struct net_device *p = dsa_to_port(ds, port)->cpu_dp->master;
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	int ret = 0;
 
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index 1d8d36de4d20..a91293e47a57 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -862,7 +862,7 @@ mt7530_port_set_vlan_unaware(struct dsa_switch *ds, int port)
 
 	for (i = 0; i < MT7530_NUM_PORTS; i++) {
 		if (dsa_is_user_port(ds, i) &&
-		    dsa_port_is_vlan_filtering(&ds->ports[i])) {
+		    dsa_port_is_vlan_filtering(dsa_to_port(ds, i))) {
 			all_user_ports_removed = false;
 			break;
 		}
@@ -922,7 +922,7 @@ mt7530_port_bridge_leave(struct dsa_switch *ds, int port,
 		 * other port is still a VLAN-aware port.
 		 */
 		if (dsa_is_user_port(ds, i) && i != port &&
-		   !dsa_port_is_vlan_filtering(&ds->ports[i])) {
+		   !dsa_port_is_vlan_filtering(dsa_to_port(ds, i))) {
 			if (dsa_to_port(ds, i)->bridge_dev != bridge)
 				continue;
 			if (priv->ports[i].enable)
@@ -1165,7 +1165,7 @@ mt7530_port_vlan_add(struct dsa_switch *ds, int port,
 	/* The port is kept as VLAN-unaware if bridge with vlan_filtering not
 	 * being set.
 	 */
-	if (!dsa_port_is_vlan_filtering(&ds->ports[port]))
+	if (!dsa_port_is_vlan_filtering(dsa_to_port(ds, port)))
 		return;
 
 	mutex_lock(&priv->reg_mutex);
@@ -1196,7 +1196,7 @@ mt7530_port_vlan_del(struct dsa_switch *ds, int port,
 	/* The port is kept as VLAN-unaware if bridge with vlan_filtering not
 	 * being set.
 	 */
-	if (!dsa_port_is_vlan_filtering(&ds->ports[port]))
+	if (!dsa_port_is_vlan_filtering(dsa_to_port(ds, port)))
 		return 0;
 
 	mutex_lock(&priv->reg_mutex);
@@ -1252,7 +1252,7 @@ mt7530_setup(struct dsa_switch *ds)
 	 * controller also is the container for two GMACs nodes representing
 	 * as two netdev instances.
 	 */
-	dn = ds->ports[MT7530_CPU_PORT].master->dev.of_node->parent;
+	dn = dsa_to_port(ds, MT7530_CPU_PORT)->master->dev.of_node->parent;
 
 	if (priv->id == ID_MT7530) {
 		priv->ethernet = syscon_node_to_regmap(dn);
@@ -1340,7 +1340,7 @@ mt7530_setup(struct dsa_switch *ds)
 
 	if (!dsa_is_unused_port(ds, 5)) {
 		priv->p5_intf_sel = P5_INTF_SEL_GMAC5;
-		interface = of_get_phy_mode(ds->ports[5].dn);
+		interface = of_get_phy_mode(dsa_to_port(ds, 5)->dn);
 	} else {
 		/* Scan the ethernet nodes. look for GMAC1, lookup used phy */
 		for_each_child_of_node(dn, mac_np) {
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 6787d560e9e3..d67deec77452 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1075,7 +1075,7 @@ static u16 mv88e6xxx_port_vlan(struct mv88e6xxx_chip *chip, int dev, int port)
 	if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
 		return mv88e6xxx_port_mask(chip);
 
-	br = ds->ports[port].bridge_dev;
+	br = dsa_to_port(ds, port)->bridge_dev;
 	pvlan = 0;
 
 	/* Frames from user ports can egress any local DSA links and CPU ports,
@@ -1402,7 +1402,7 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port,
 			if (dsa_is_dsa_port(ds, i) || dsa_is_cpu_port(ds, i))
 				continue;
 
-			if (!ds->ports[i].slave)
+			if (!dsa_to_port(ds, i)->slave)
 				continue;
 
 			if (vlan.member[i] ==
@@ -1410,7 +1410,7 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port,
 				continue;
 
 			if (dsa_to_port(ds, i)->bridge_dev ==
-			    ds->ports[port].bridge_dev)
+			    dsa_to_port(ds, port)->bridge_dev)
 				break; /* same bridge, check next VLAN */
 
 			if (!dsa_to_port(ds, i)->bridge_dev)
@@ -2042,7 +2042,7 @@ static int mv88e6xxx_bridge_map(struct mv88e6xxx_chip *chip,
 
 	/* Remap the Port VLAN of each local bridge group member */
 	for (port = 0; port < mv88e6xxx_num_ports(chip); ++port) {
-		if (chip->ds->ports[port].bridge_dev == br) {
+		if (dsa_to_port(chip->ds, port)->bridge_dev == br) {
 			err = mv88e6xxx_port_vlan_map(chip, port);
 			if (err)
 				return err;
@@ -2059,7 +2059,7 @@ static int mv88e6xxx_bridge_map(struct mv88e6xxx_chip *chip,
 			break;
 
 		for (port = 0; port < ds->num_ports; ++port) {
-			if (ds->ports[port].bridge_dev == br) {
+			if (dsa_to_port(ds, port)->bridge_dev == br) {
 				err = mv88e6xxx_pvt_map(chip, dev, port);
 				if (err)
 					return err;
diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index b00274caae4f..71e44c8763b8 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -661,7 +661,7 @@ qca8k_setup(struct dsa_switch *ds)
 		return ret;
 
 	/* Initialize CPU port pad mode (xMII type, delays...) */
-	phy_mode = of_get_phy_mode(ds->ports[QCA8K_CPU_PORT].dn);
+	phy_mode = of_get_phy_mode(dsa_to_port(ds, QCA8K_CPU_PORT)->dn);
 	if (phy_mode < 0) {
 		pr_err("Can't find phy-mode for master device\n");
 		return phy_mode;
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 2ffe642cf54b..4b0cb779f187 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1058,7 +1058,7 @@ int sja1105pqrs_fdb_add(struct dsa_switch *ds, int port,
 	l2_lookup.vlanid = vid;
 	l2_lookup.iotag = SJA1105_S_TAG;
 	l2_lookup.mask_macaddr = GENMASK_ULL(ETH_ALEN * 8 - 1, 0);
-	if (dsa_port_is_vlan_filtering(&ds->ports[port])) {
+	if (dsa_port_is_vlan_filtering(dsa_to_port(ds, port))) {
 		l2_lookup.mask_vlanid = VLAN_VID_MASK;
 		l2_lookup.mask_iotag = BIT(0);
 	} else {
@@ -1121,7 +1121,7 @@ int sja1105pqrs_fdb_del(struct dsa_switch *ds, int port,
 	l2_lookup.vlanid = vid;
 	l2_lookup.iotag = SJA1105_S_TAG;
 	l2_lookup.mask_macaddr = GENMASK_ULL(ETH_ALEN * 8 - 1, 0);
-	if (dsa_port_is_vlan_filtering(&ds->ports[port])) {
+	if (dsa_port_is_vlan_filtering(dsa_to_port(ds, port))) {
 		l2_lookup.mask_vlanid = VLAN_VID_MASK;
 		l2_lookup.mask_iotag = BIT(0);
 	} else {
@@ -1167,7 +1167,7 @@ static int sja1105_fdb_add(struct dsa_switch *ds, int port,
 	 * for what gets printed in 'bridge fdb show'.  In the case of zero,
 	 * no VID gets printed at all.
 	 */
-	if (!dsa_port_is_vlan_filtering(&ds->ports[port]))
+	if (!dsa_port_is_vlan_filtering(dsa_to_port(ds, port)))
 		vid = 0;
 
 	return priv->info->fdb_add_cmd(ds, port, addr, vid);
@@ -1178,7 +1178,7 @@ static int sja1105_fdb_del(struct dsa_switch *ds, int port,
 {
 	struct sja1105_private *priv = ds->priv;
 
-	if (!dsa_port_is_vlan_filtering(&ds->ports[port]))
+	if (!dsa_port_is_vlan_filtering(dsa_to_port(ds, port)))
 		vid = 0;
 
 	return priv->info->fdb_del_cmd(ds, port, addr, vid);
@@ -1217,7 +1217,7 @@ static int sja1105_fdb_dump(struct dsa_switch *ds, int port,
 		u64_to_ether_addr(l2_lookup.macaddr, macaddr);
 
 		/* We need to hide the dsa_8021q VLANs from the user. */
-		if (!dsa_port_is_vlan_filtering(&ds->ports[port]))
+		if (!dsa_port_is_vlan_filtering(dsa_to_port(ds, port)))
 			l2_lookup.vlanid = 0;
 		cb(macaddr, l2_lookup.vlanid, l2_lookup.lockeds, data);
 	}
@@ -1704,7 +1704,7 @@ static int sja1105_port_enable(struct dsa_switch *ds, int port,
 	if (!dsa_is_user_port(ds, port))
 		return 0;
 
-	slave = ds->ports[port].slave;
+	slave = dsa_to_port(ds, port)->slave;
 
 	slave->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
 
@@ -1736,7 +1736,7 @@ static int sja1105_mgmt_xmit(struct dsa_switch *ds, int port, int slot,
 	}
 
 	/* Transfer skb to the host port. */
-	dsa_enqueue_skb(skb, ds->ports[port].slave);
+	dsa_enqueue_skb(skb, dsa_to_port(ds, port)->slave);
 
 	/* Wait until the switch has processed the frame */
 	do {
@@ -2061,8 +2061,8 @@ static int sja1105_probe(struct spi_device *spi)
 	for (i = 0; i < SJA1105_NUM_PORTS; i++) {
 		struct sja1105_port *sp = &priv->ports[i];
 
-		ds->ports[i].priv = sp;
-		sp->dp = &ds->ports[i];
+		dsa_to_port(ds, i)->priv = sp;
+		sp->dp = dsa_to_port(ds, i);
 		sp->data = tagger_data;
 	}
 	mutex_init(&priv->ptp_data.lock);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 8c3ea0530f65..2e4fe2f8962b 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -278,7 +278,7 @@ struct dsa_switch {
 	struct dsa_port ports[];
 };
 
-static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p)
+static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p)
 {
 	return &ds->ports[p];
 }
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 43120a3fb06f..a5545762f5e7 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -246,7 +246,9 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 #ifdef CONFIG_PM_SLEEP
 static bool dsa_is_port_initialized(struct dsa_switch *ds, int p)
 {
-	return dsa_is_user_port(ds, p) && ds->ports[p].slave;
+	const struct dsa_port *dp = dsa_to_port(ds, p);
+
+	return dp->type == DSA_PORT_TYPE_USER && dp->slave;
 }
 
 int dsa_switch_suspend(struct dsa_switch *ds)
@@ -258,7 +260,7 @@ int dsa_switch_suspend(struct dsa_switch *ds)
 		if (!dsa_is_port_initialized(ds, i))
 			continue;
 
-		ret = dsa_slave_suspend(ds->ports[i].slave);
+		ret = dsa_slave_suspend(dsa_to_port(ds, i)->slave);
 		if (ret)
 			return ret;
 	}
@@ -285,7 +287,7 @@ int dsa_switch_resume(struct dsa_switch *ds)
 		if (!dsa_is_port_initialized(ds, i))
 			continue;
 
-		ret = dsa_slave_resume(ds->ports[i].slave);
+		ret = dsa_slave_resume(dsa_to_port(ds, i)->slave);
 		if (ret)
 			return ret;
 	}
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 716d265ba8ca..1716535167ee 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -708,7 +708,7 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds,
 			goto out_put_node;
 		}
 
-		dp = &ds->ports[reg];
+		dp = dsa_to_port(ds, reg);
 
 		err = dsa_port_parse_of(dp, port);
 		if (err)
@@ -787,7 +787,7 @@ static int dsa_switch_parse_ports(struct dsa_switch *ds,
 	for (i = 0; i < DSA_MAX_PORTS; i++) {
 		name = cd->port_names[i];
 		dev = cd->netdev[i];
-		dp = &ds->ports[i];
+		dp = dsa_to_port(ds, i);
 
 		if (!name)
 			continue;
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 6a9607518823..df4abe897ed6 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -20,7 +20,7 @@ static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds,
 	int i;
 
 	for (i = 0; i < ds->num_ports; ++i) {
-		struct dsa_port *dp = &ds->ports[i];
+		struct dsa_port *dp = dsa_to_port(ds, i);
 
 		if (dp->ageing_time && dp->ageing_time < ageing_time)
 			ageing_time = dp->ageing_time;
@@ -98,7 +98,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
 	if (unset_vlan_filtering) {
 		struct switchdev_trans trans = {0};
 
-		err = dsa_port_vlan_filtering(&ds->ports[info->port],
+		err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port),
 					      false, &trans);
 		if (err && err != EOPNOTSUPP)
 			return err;
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 9c1cc2482b68..bf91fc55fc44 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -103,7 +103,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port)
 	if (!dsa_is_user_port(ds, port))
 		return 0;
 
-	slave = ds->ports[port].slave;
+	slave = dsa_to_port(ds, port)->slave;
 
 	err = br_vlan_get_pvid(slave, &pvid);
 	if (err < 0)
@@ -118,7 +118,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port)
 		return err;
 	}
 
-	return dsa_port_vid_add(&ds->ports[port], pvid, vinfo.flags);
+	return dsa_port_vid_add(dsa_to_port(ds, port), pvid, vinfo.flags);
 }
 
 /* If @enabled is true, installs @vid with @flags into the switch port's HW
@@ -130,7 +130,7 @@ static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port)
 static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid,
 			       u16 flags, bool enabled)
 {
-	struct dsa_port *dp = &ds->ports[port];
+	struct dsa_port *dp = dsa_to_port(ds, port);
 	struct bridge_vlan_info vinfo;
 	int err;
 
-- 
cgit v1.2.3-59-g8ed1b


From ab8ccae122a41530a89bc899ace0e46defb156a8 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Mon, 21 Oct 2019 16:51:16 -0400
Subject: net: dsa: add ports list in the switch fabric

Add a list of switch ports within the switch fabric. This will help the
lookup of a port inside the whole fabric, and it is the first step
towards supporting multiple CPU ports, before deprecating the usage of
the unique dst->cpu_dp pointer.

In preparation for a future allocation of the dsa_port structures,
return -ENOMEM in case no structure is returned, even though this
error cannot be reached yet.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/dsa.h |  5 +++++
 net/dsa/dsa2.c    | 48 ++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 47 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 2e4fe2f8962b..6ff6dfcdc61d 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -125,6 +125,9 @@ struct dsa_switch_tree {
 	 */
 	struct dsa_port		*cpu_dp;
 
+	/* List of switch ports */
+	struct list_head ports;
+
 	/*
 	 * Data for the individual switch chips.
 	 */
@@ -195,6 +198,8 @@ struct dsa_port {
 	struct work_struct	xmit_work;
 	struct sk_buff_head	xmit_queue;
 
+	struct list_head list;
+
 	/*
 	 * Give the switch driver somewhere to hang its per-port private data
 	 * structures (accessible from the tagger).
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 1716535167ee..ba27ff8b4445 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -45,6 +45,8 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index)
 
 	dst->index = index;
 
+	INIT_LIST_HEAD(&dst->ports);
+
 	INIT_LIST_HEAD(&dst->list);
 	list_add_tail(&dst->list, &dsa_tree_list);
 
@@ -616,6 +618,22 @@ static int dsa_tree_add_switch(struct dsa_switch_tree *dst,
 	return err;
 }
 
+static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index)
+{
+	struct dsa_switch_tree *dst = ds->dst;
+	struct dsa_port *dp;
+
+	dp = &ds->ports[index];
+
+	dp->ds = ds;
+	dp->index = index;
+
+	INIT_LIST_HEAD(&dp->list);
+	list_add_tail(&dp->list, &dst->ports);
+
+	return dp;
+}
+
 static int dsa_port_parse_user(struct dsa_port *dp, const char *name)
 {
 	if (!name)
@@ -742,6 +760,20 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds,
 	return 0;
 }
 
+static int dsa_switch_touch_ports(struct dsa_switch *ds)
+{
+	struct dsa_port *dp;
+	int port;
+
+	for (port = 0; port < ds->num_ports; port++) {
+		dp = dsa_port_touch(ds, port);
+		if (!dp)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
 static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn)
 {
 	int err;
@@ -750,6 +782,10 @@ static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn)
 	if (err)
 		return err;
 
+	err = dsa_switch_touch_ports(ds);
+	if (err)
+		return err;
+
 	return dsa_switch_parse_ports_of(ds, dn);
 }
 
@@ -807,6 +843,8 @@ static int dsa_switch_parse_ports(struct dsa_switch *ds,
 
 static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd)
 {
+	int err;
+
 	ds->cd = cd;
 
 	/* We don't support interconnected switches nor multiple trees via
@@ -817,6 +855,10 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd)
 	if (!ds->dst)
 		return -ENOMEM;
 
+	err = dsa_switch_touch_ports(ds);
+	if (err)
+		return err;
+
 	return dsa_switch_parse_ports(ds, cd);
 }
 
@@ -849,7 +891,6 @@ static int dsa_switch_probe(struct dsa_switch *ds)
 struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
 {
 	struct dsa_switch *ds;
-	int i;
 
 	ds = devm_kzalloc(dev, struct_size(ds, ports, n), GFP_KERNEL);
 	if (!ds)
@@ -858,11 +899,6 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
 	ds->dev = dev;
 	ds->num_ports = n;
 
-	for (i = 0; i < ds->num_ports; ++i) {
-		ds->ports[i].index = i;
-		ds->ports[i].ds = ds;
-	}
-
 	return ds;
 }
 EXPORT_SYMBOL_GPL(dsa_switch_alloc);
-- 
cgit v1.2.3-59-g8ed1b


From b96ddf254b09447c6b79632cdc02dae3f2454a82 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Mon, 21 Oct 2019 16:51:17 -0400
Subject: net: dsa: use ports list in dsa_to_port

Use the new ports list instead of accessing the dsa_switch array
of ports in the dsa_to_port helper.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/dsa.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 6ff6dfcdc61d..d2b7ee28f3fd 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -285,7 +285,14 @@ struct dsa_switch {
 
 static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p)
 {
-	return &ds->ports[p];
+	struct dsa_switch_tree *dst = ds->dst;
+	struct dsa_port *dp = NULL;
+
+	list_for_each_entry(dp, &dst->ports, list)
+		if (dp->ds == ds && dp->index == p)
+			break;
+
+	return dp;
 }
 
 static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p)
-- 
cgit v1.2.3-59-g8ed1b


From fb35c60cbacc67a6075fb8e3d98fa348665662fe Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Mon, 21 Oct 2019 16:51:19 -0400
Subject: net: dsa: use ports list to setup switches

Use the new ports list instead of iterating over switches and their
ports when setting up the switches and their ports.

At the same time, provide setup states and messages for ports and
switches as it is done for the trees.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/dsa.h |  4 +++
 net/dsa/dsa2.c    | 93 ++++++++++++++++++++++++-------------------------------
 2 files changed, 45 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index d2b7ee28f3fd..bd08bdee8341 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -215,9 +215,13 @@ struct dsa_port {
 	 * Original copy of the master netdev net_device_ops
 	 */
 	const struct net_device_ops *orig_ndo_ops;
+
+	bool setup;
 };
 
 struct dsa_switch {
+	bool setup;
+
 	struct device *dev;
 
 	/*
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index ba27ff8b4445..01b6047d9b7b 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -267,6 +267,9 @@ static int dsa_port_setup(struct dsa_port *dp)
 	bool dsa_port_enabled = false;
 	int err = 0;
 
+	if (dp->setup)
+		return 0;
+
 	switch (dp->type) {
 	case DSA_PORT_TYPE_UNUSED:
 		dsa_port_disable(dp);
@@ -335,14 +338,21 @@ static int dsa_port_setup(struct dsa_port *dp)
 		dsa_port_link_unregister_of(dp);
 	if (err && devlink_port_registered)
 		devlink_port_unregister(dlp);
+	if (err)
+		return err;
 
-	return err;
+	dp->setup = true;
+
+	return 0;
 }
 
 static void dsa_port_teardown(struct dsa_port *dp)
 {
 	struct devlink_port *dlp = &dp->devlink_port;
 
+	if (!dp->setup)
+		return;
+
 	switch (dp->type) {
 	case DSA_PORT_TYPE_UNUSED:
 		break;
@@ -365,11 +375,16 @@ static void dsa_port_teardown(struct dsa_port *dp)
 		}
 		break;
 	}
+
+	dp->setup = false;
 }
 
 static int dsa_switch_setup(struct dsa_switch *ds)
 {
-	int err = 0;
+	int err;
+
+	if (ds->setup)
+		return 0;
 
 	/* Initialize ds->phys_mii_mask before registering the slave MDIO bus
 	 * driver and before ops->setup() has run, since the switch drivers and
@@ -411,6 +426,8 @@ static int dsa_switch_setup(struct dsa_switch *ds)
 			goto unregister_notifier;
 	}
 
+	ds->setup = true;
+
 	return 0;
 
 unregister_notifier:
@@ -426,6 +443,9 @@ free_devlink:
 
 static void dsa_switch_teardown(struct dsa_switch *ds)
 {
+	if (!ds->setup)
+		return;
+
 	if (ds->slave_mii_bus && ds->ops->phy_read)
 		mdiobus_unregister(ds->slave_mii_bus);
 
@@ -440,78 +460,47 @@ static void dsa_switch_teardown(struct dsa_switch *ds)
 		ds->devlink = NULL;
 	}
 
+	ds->setup = false;
 }
 
 static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
 {
-	struct dsa_switch *ds;
 	struct dsa_port *dp;
-	int device, port, i;
-	int err = 0;
-
-	for (device = 0; device < DSA_MAX_SWITCHES; device++) {
-		ds = dst->ds[device];
-		if (!ds)
-			continue;
+	int err;
 
-		err = dsa_switch_setup(ds);
+	list_for_each_entry(dp, &dst->ports, list) {
+		err = dsa_switch_setup(dp->ds);
 		if (err)
-			goto switch_teardown;
-
-		for (port = 0; port < ds->num_ports; port++) {
-			dp = &ds->ports[port];
+			goto teardown;
+	}
 
-			err = dsa_port_setup(dp);
-			if (err)
-				goto ports_teardown;
-		}
+	list_for_each_entry(dp, &dst->ports, list) {
+		err = dsa_port_setup(dp);
+		if (err)
+			goto teardown;
 	}
 
 	return 0;
 
-ports_teardown:
-	for (i = 0; i < port; i++)
-		dsa_port_teardown(&ds->ports[i]);
-
-	dsa_switch_teardown(ds);
-
-switch_teardown:
-	for (i = 0; i < device; i++) {
-		ds = dst->ds[i];
-		if (!ds)
-			continue;
-
-		for (port = 0; port < ds->num_ports; port++) {
-			dp = &ds->ports[port];
-
-			dsa_port_teardown(dp);
-		}
+teardown:
+	list_for_each_entry(dp, &dst->ports, list)
+		dsa_port_teardown(dp);
 
-		dsa_switch_teardown(ds);
-	}
+	list_for_each_entry(dp, &dst->ports, list)
+		dsa_switch_teardown(dp->ds);
 
 	return err;
 }
 
 static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst)
 {
-	struct dsa_switch *ds;
 	struct dsa_port *dp;
-	int device, port;
 
-	for (device = 0; device < DSA_MAX_SWITCHES; device++) {
-		ds = dst->ds[device];
-		if (!ds)
-			continue;
+	list_for_each_entry(dp, &dst->ports, list)
+		dsa_port_teardown(dp);
 
-		for (port = 0; port < ds->num_ports; port++) {
-			dp = &ds->ports[port];
-
-			dsa_port_teardown(dp);
-		}
-
-		dsa_switch_teardown(ds);
-	}
+	list_for_each_entry(dp, &dst->ports, list)
+		dsa_switch_teardown(dp->ds);
 }
 
 static int dsa_tree_setup_master(struct dsa_switch_tree *dst)
-- 
cgit v1.2.3-59-g8ed1b


From da4561cda2ea6240fc61442eeb2acc47e2e0cae3 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Mon, 21 Oct 2019 16:51:24 -0400
Subject: net: dsa: use ports list to setup default CPU port

Use the new ports list instead of iterating over switches and their
ports when setting up the default CPU port. Unassign it on teardown.

Now that we can iterate over multiple CPU ports, remove dst->cpu_dp.

At the same time, provide a better error message for CPU-less tree.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/dsa.h |  5 -----
 net/dsa/dsa2.c    | 33 ++++++++++++---------------------
 2 files changed, 12 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index bd08bdee8341..f572134eb5de 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -120,11 +120,6 @@ struct dsa_switch_tree {
 	 */
 	struct dsa_platform_data	*pd;
 
-	/*
-	 * The switch port to which the CPU is attached.
-	 */
-	struct dsa_port		*cpu_dp;
-
 	/* List of switch ports */
 	struct list_head ports;
 
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 80191c7702a9..bf8b4e0fcb4f 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -197,38 +197,29 @@ static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst)
 
 static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst)
 {
-	struct dsa_switch *ds;
-	struct dsa_port *dp;
-	int device, port;
+	struct dsa_port *cpu_dp, *dp;
 
-	/* DSA currently only supports a single CPU port */
-	dst->cpu_dp = dsa_tree_find_first_cpu(dst);
-	if (!dst->cpu_dp) {
-		pr_warn("Tree has no master device\n");
+	cpu_dp = dsa_tree_find_first_cpu(dst);
+	if (!cpu_dp) {
+		pr_err("DSA: tree %d has no CPU port\n", dst->index);
 		return -EINVAL;
 	}
 
 	/* Assign the default CPU port to all ports of the fabric */
-	for (device = 0; device < DSA_MAX_SWITCHES; device++) {
-		ds = dst->ds[device];
-		if (!ds)
-			continue;
-
-		for (port = 0; port < ds->num_ports; port++) {
-			dp = &ds->ports[port];
-
-			if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
-				dp->cpu_dp = dst->cpu_dp;
-		}
-	}
+	list_for_each_entry(dp, &dst->ports, list)
+		if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
+			dp->cpu_dp = cpu_dp;
 
 	return 0;
 }
 
 static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst)
 {
-	/* DSA currently only supports a single CPU port */
-	dst->cpu_dp = NULL;
+	struct dsa_port *dp;
+
+	list_for_each_entry(dp, &dst->ports, list)
+		if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
+			dp->cpu_dp = NULL;
 }
 
 static int dsa_port_setup(struct dsa_port *dp)
-- 
cgit v1.2.3-59-g8ed1b


From 05f294a852358a46d9236cc777901f49a4f0ae85 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Mon, 21 Oct 2019 16:51:29 -0400
Subject: net: dsa: allocate ports on touch

Allocate the struct dsa_port the first time it is accessed with
dsa_port_touch, and remove the static dsa_port array from the
dsa_switch structure.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/dsa.h |  2 --
 net/dsa/dsa2.c    | 16 ++++++++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index f572134eb5de..9bc1d3f71f89 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -277,9 +277,7 @@ struct dsa_switch {
 	 */
 	bool			vlan_filtering;
 
-	/* Dynamically allocated ports, keep last */
 	size_t num_ports;
-	struct dsa_port ports[];
 };
 
 static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p)
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index bf8b4e0fcb4f..83cba4623698 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -588,7 +588,13 @@ static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index)
 	struct dsa_switch_tree *dst = ds->dst;
 	struct dsa_port *dp;
 
-	dp = &ds->ports[index];
+	list_for_each_entry(dp, &dst->ports, list)
+		if (dp->ds == ds && dp->index == index)
+			return dp;
+
+	dp = kzalloc(sizeof(*dp), GFP_KERNEL);
+	if (!dp)
+		return NULL;
 
 	dp->ds = ds;
 	dp->index = index;
@@ -857,7 +863,7 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
 {
 	struct dsa_switch *ds;
 
-	ds = devm_kzalloc(dev, struct_size(ds, ports, n), GFP_KERNEL);
+	ds = devm_kzalloc(dev, sizeof(*ds), GFP_KERNEL);
 	if (!ds)
 		return NULL;
 
@@ -885,6 +891,12 @@ static void dsa_switch_remove(struct dsa_switch *ds)
 {
 	struct dsa_switch_tree *dst = ds->dst;
 	unsigned int index = ds->index;
+	struct dsa_port *dp, *next;
+
+	list_for_each_entry_safe(dp, next, &dst->ports, list) {
+		list_del(&dp->list);
+		kfree(dp);
+	}
 
 	dsa_tree_remove_switch(dst, index);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 7e99e34701728d54ccd0466eccf377a42b9db215 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Mon, 21 Oct 2019 16:51:30 -0400
Subject: net: dsa: remove dsa_switch_alloc helper

Now that ports are dynamically listed in the fabric, there is no need
to provide a special helper to allocate the dsa_switch structure. This
will give more flexibility to drivers to embed this structure as they
wish in their private structure.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/dsa/b53/b53_common.c       |  5 ++++-
 drivers/net/dsa/dsa_loop.c             |  5 ++++-
 drivers/net/dsa/lan9303-core.c         |  4 +++-
 drivers/net/dsa/lantiq_gswip.c         |  4 +++-
 drivers/net/dsa/microchip/ksz_common.c |  5 ++++-
 drivers/net/dsa/mt7530.c               |  5 ++++-
 drivers/net/dsa/mv88e6060.c            |  4 +++-
 drivers/net/dsa/mv88e6xxx/chip.c       |  4 +++-
 drivers/net/dsa/qca8k.c                |  5 ++++-
 drivers/net/dsa/realtek-smi-core.c     |  5 ++++-
 drivers/net/dsa/sja1105/sja1105_main.c |  4 +++-
 drivers/net/dsa/vitesse-vsc73xx-core.c |  5 ++++-
 include/net/dsa.h                      |  1 -
 net/dsa/dsa2.c                         | 21 ++++++---------------
 14 files changed, 49 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index aef9b56781ef..baadf622ac55 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -2341,10 +2341,13 @@ struct b53_device *b53_switch_alloc(struct device *base,
 	struct dsa_switch *ds;
 	struct b53_device *dev;
 
-	ds = dsa_switch_alloc(base, DSA_MAX_PORTS);
+	ds = devm_kzalloc(base, sizeof(*ds), GFP_KERNEL);
 	if (!ds)
 		return NULL;
 
+	ds->dev = base;
+	ds->num_ports = DSA_MAX_PORTS;
+
 	dev = devm_kzalloc(base, sizeof(*dev), GFP_KERNEL);
 	if (!dev)
 		return NULL;
diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c
index 925ed135a4d9..c8d7ef27fd72 100644
--- a/drivers/net/dsa/dsa_loop.c
+++ b/drivers/net/dsa/dsa_loop.c
@@ -286,10 +286,13 @@ static int dsa_loop_drv_probe(struct mdio_device *mdiodev)
 	dev_info(&mdiodev->dev, "%s: 0x%0x\n",
 		 pdata->name, pdata->enabled_ports);
 
-	ds = dsa_switch_alloc(&mdiodev->dev, DSA_MAX_PORTS);
+	ds = devm_kzalloc(&mdiodev->dev, sizeof(*ds), GFP_KERNEL);
 	if (!ds)
 		return -ENOMEM;
 
+	ds->dev = &mdiodev->dev;
+	ds->num_ports = DSA_MAX_PORTS;
+
 	ps = devm_kzalloc(&mdiodev->dev, sizeof(*ps), GFP_KERNEL);
 	if (!ps)
 		return -ENOMEM;
diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index bbec86b9418e..e3c333a8f45d 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -1283,10 +1283,12 @@ static int lan9303_register_switch(struct lan9303 *chip)
 {
 	int base;
 
-	chip->ds = dsa_switch_alloc(chip->dev, LAN9303_NUM_PORTS);
+	chip->ds = devm_kzalloc(chip->dev, sizeof(*chip->ds), GFP_KERNEL);
 	if (!chip->ds)
 		return -ENOMEM;
 
+	chip->ds->dev = chip->dev;
+	chip->ds->num_ports = LAN9303_NUM_PORTS;
 	chip->ds->priv = chip;
 	chip->ds->ops = &lan9303_switch_ops;
 	base = chip->phy_addr_base;
diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c
index a69c9b9878b7..955324968b74 100644
--- a/drivers/net/dsa/lantiq_gswip.c
+++ b/drivers/net/dsa/lantiq_gswip.c
@@ -1854,10 +1854,12 @@ static int gswip_probe(struct platform_device *pdev)
 	if (!priv->hw_info)
 		return -EINVAL;
 
-	priv->ds = dsa_switch_alloc(dev, priv->hw_info->max_ports);
+	priv->ds = devm_kzalloc(dev, sizeof(*priv->ds), GFP_KERNEL);
 	if (!priv->ds)
 		return -ENOMEM;
 
+	priv->ds->dev = dev;
+	priv->ds->num_ports = priv->hw_info->max_ports;
 	priv->ds->priv = priv;
 	priv->ds->ops = &gswip_switch_ops;
 	priv->dev = dev;
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index fe47180c908b..5d08e4430824 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -398,10 +398,13 @@ struct ksz_device *ksz_switch_alloc(struct device *base, void *priv)
 	struct dsa_switch *ds;
 	struct ksz_device *swdev;
 
-	ds = dsa_switch_alloc(base, DSA_MAX_PORTS);
+	ds = devm_kzalloc(base, sizeof(*ds), GFP_KERNEL);
 	if (!ds)
 		return NULL;
 
+	ds->dev = base;
+	ds->num_ports = DSA_MAX_PORTS;
+
 	swdev = devm_kzalloc(base, sizeof(*swdev), GFP_KERNEL);
 	if (!swdev)
 		return NULL;
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index a91293e47a57..add9e4279176 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -1632,10 +1632,13 @@ mt7530_probe(struct mdio_device *mdiodev)
 	if (!priv)
 		return -ENOMEM;
 
-	priv->ds = dsa_switch_alloc(&mdiodev->dev, DSA_MAX_PORTS);
+	priv->ds = devm_kzalloc(&mdiodev->dev, sizeof(*priv->ds), GFP_KERNEL);
 	if (!priv->ds)
 		return -ENOMEM;
 
+	priv->ds->dev = &mdiodev->dev;
+	priv->ds->num_ports = DSA_MAX_PORTS;
+
 	/* Use medatek,mcm property to distinguish hardware type that would
 	 * casues a little bit differences on power-on sequence.
 	 */
diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c
index 2a2489b5196d..a5a37f47b320 100644
--- a/drivers/net/dsa/mv88e6060.c
+++ b/drivers/net/dsa/mv88e6060.c
@@ -270,10 +270,12 @@ static int mv88e6060_probe(struct mdio_device *mdiodev)
 
 	dev_info(dev, "switch %s detected\n", name);
 
-	ds = dsa_switch_alloc(dev, MV88E6060_PORTS);
+	ds = devm_kzalloc(dev, sizeof(*ds), GFP_KERNEL);
 	if (!ds)
 		return -ENOMEM;
 
+	ds->dev = dev;
+	ds->num_ports = MV88E6060_PORTS;
 	ds->priv = priv;
 	ds->dev = dev;
 	ds->ops = &mv88e6060_switch_ops;
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index c53d4dc88e90..5fdf6d6ebe27 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -4978,10 +4978,12 @@ static int mv88e6xxx_register_switch(struct mv88e6xxx_chip *chip)
 	struct device *dev = chip->dev;
 	struct dsa_switch *ds;
 
-	ds = dsa_switch_alloc(dev, mv88e6xxx_num_ports(chip));
+	ds = devm_kzalloc(dev, sizeof(*ds), GFP_KERNEL);
 	if (!ds)
 		return -ENOMEM;
 
+	ds->dev = dev;
+	ds->num_ports = mv88e6xxx_num_ports(chip);
 	ds->priv = chip;
 	ds->dev = dev;
 	ds->ops = &mv88e6xxx_switch_ops;
diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index 71e44c8763b8..7e742cd491e8 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -1077,10 +1077,13 @@ qca8k_sw_probe(struct mdio_device *mdiodev)
 	if (id != QCA8K_ID_QCA8337)
 		return -ENODEV;
 
-	priv->ds = dsa_switch_alloc(&mdiodev->dev, QCA8K_NUM_PORTS);
+	priv->ds = devm_kzalloc(&mdiodev->dev, sizeof(*priv->ds),
+				QCA8K_NUM_PORTS);
 	if (!priv->ds)
 		return -ENOMEM;
 
+	priv->ds->dev = &mdiodev->dev;
+	priv->ds->num_ports = DSA_MAX_PORTS;
 	priv->ds->priv = priv;
 	priv->ops = qca8k_switch_ops;
 	priv->ds->ops = &priv->ops;
diff --git a/drivers/net/dsa/realtek-smi-core.c b/drivers/net/dsa/realtek-smi-core.c
index dc0509c02d29..fae188c60191 100644
--- a/drivers/net/dsa/realtek-smi-core.c
+++ b/drivers/net/dsa/realtek-smi-core.c
@@ -444,9 +444,12 @@ static int realtek_smi_probe(struct platform_device *pdev)
 		return ret;
 	}
 
-	smi->ds = dsa_switch_alloc(dev, smi->num_ports);
+	smi->ds = devm_kzalloc(dev, sizeof(*smi->ds), GFP_KERNEL);
 	if (!smi->ds)
 		return -ENOMEM;
+
+	smi->ds->dev = dev;
+	smi->ds->num_ports = smi->num_ports;
 	smi->ds->priv = smi;
 
 	smi->ds->ops = var->ds_ops;
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 0ebbda5ca665..2ae84a9dea59 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -2047,10 +2047,12 @@ static int sja1105_probe(struct spi_device *spi)
 
 	dev_info(dev, "Probed switch chip: %s\n", priv->info->name);
 
-	ds = dsa_switch_alloc(dev, SJA1105_NUM_PORTS);
+	ds = devm_kzalloc(dev, sizeof(*ds), GFP_KERNEL);
 	if (!ds)
 		return -ENOMEM;
 
+	ds->dev = dev;
+	ds->num_ports = SJA1105_NUM_PORTS;
 	ds->ops = &sja1105_switch_ops;
 	ds->priv = priv;
 	priv->ds = ds;
diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c
index 614377ef7956..42c1574d45f2 100644
--- a/drivers/net/dsa/vitesse-vsc73xx-core.c
+++ b/drivers/net/dsa/vitesse-vsc73xx-core.c
@@ -1178,9 +1178,12 @@ int vsc73xx_probe(struct vsc73xx *vsc)
 	 * We allocate 8 ports and avoid access to the nonexistant
 	 * ports.
 	 */
-	vsc->ds = dsa_switch_alloc(dev, 8);
+	vsc->ds = devm_kzalloc(dev, sizeof(*vsc->ds), GFP_KERNEL);
 	if (!vsc->ds)
 		return -ENOMEM;
+
+	vsc->ds->dev = dev;
+	vsc->ds->num_ports = 8;
 	vsc->ds->priv = vsc;
 
 	vsc->ds->ops = &vsc73xx_ds_ops;
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 9bc1d3f71f89..e3c14dc3bab9 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -577,7 +577,6 @@ static inline bool dsa_can_decode(const struct sk_buff *skb,
 	return false;
 }
 
-struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n);
 void dsa_unregister_switch(struct dsa_switch *ds);
 int dsa_register_switch(struct dsa_switch *ds);
 #ifdef CONFIG_PM_SLEEP
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 83cba4623698..1e3ac9b56c89 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -846,6 +846,12 @@ static int dsa_switch_probe(struct dsa_switch *ds)
 	struct device_node *np = ds->dev->of_node;
 	int err;
 
+	if (!ds->dev)
+		return -ENODEV;
+
+	if (!ds->num_ports)
+		return -EINVAL;
+
 	if (np)
 		err = dsa_switch_parse_of(ds, np);
 	else if (pdata)
@@ -859,21 +865,6 @@ static int dsa_switch_probe(struct dsa_switch *ds)
 	return dsa_switch_add(ds);
 }
 
-struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
-{
-	struct dsa_switch *ds;
-
-	ds = devm_kzalloc(dev, sizeof(*ds), GFP_KERNEL);
-	if (!ds)
-		return NULL;
-
-	ds->dev = dev;
-	ds->num_ports = n;
-
-	return ds;
-}
-EXPORT_SYMBOL_GPL(dsa_switch_alloc);
-
 int dsa_register_switch(struct dsa_switch *ds)
 {
 	int err;
-- 
cgit v1.2.3-59-g8ed1b


From 5e5b03d163e15a40b0fa57c70b4e8edd549b0b98 Mon Sep 17 00:00:00 2001
From: "Ben Dooks (Codethink)" <ben.dooks@codethink.co.uk>
Date: Tue, 22 Oct 2019 13:59:25 +0100
Subject: xdp: Fix type of string pointer in __XDP_ACT_SYM_TAB

The table entry in __XDP_ACT_SYM_TAB for the last item is set
to { -1, 0 } where it should be { -1, NULL } as the second item
is a pointer to a string.

Fixes the following sparse warnings:

./include/trace/events/xdp.h:28:1: warning: Using plain integer as NULL pointer
./include/trace/events/xdp.h:53:1: warning: Using plain integer as NULL pointer
./include/trace/events/xdp.h:82:1: warning: Using plain integer as NULL pointer
./include/trace/events/xdp.h:140:1: warning: Using plain integer as NULL pointer
./include/trace/events/xdp.h:155:1: warning: Using plain integer as NULL pointer
./include/trace/events/xdp.h:190:1: warning: Using plain integer as NULL pointer
./include/trace/events/xdp.h:225:1: warning: Using plain integer as NULL pointer
./include/trace/events/xdp.h:260:1: warning: Using plain integer as NULL pointer
./include/trace/events/xdp.h:318:1: warning: Using plain integer as NULL pointer
./include/trace/events/xdp.h:356:1: warning: Using plain integer as NULL pointer
./include/trace/events/xdp.h:390:1: warning: Using plain integer as NULL pointer

Signed-off-by: Ben Dooks (Codethink) <ben.dooks@codethink.co.uk>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20191022125925.10508-1-ben.dooks@codethink.co.uk
---
 include/trace/events/xdp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 8c8420230a10..c7e3c9c5bad3 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -22,7 +22,7 @@
 #define __XDP_ACT_SYM_FN(x)	\
 	{ XDP_##x, #x },
 #define __XDP_ACT_SYM_TAB	\
-	__XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, 0 }
+	__XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, NULL }
 __XDP_ACT_MAP(__XDP_ACT_TP_FN)
 
 TRACE_EVENT(xdp_exception,
-- 
cgit v1.2.3-59-g8ed1b


From 71a8a63b9dbdeba8205a37979b81d4fba499d079 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 16 Oct 2019 14:23:55 +0200
Subject: netfilter: nf_flow_table: move priority to struct nf_flowtable

Hardware offload needs access to the priority field, store this field in
the nf_flowtable object.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h |  1 +
 include/net/netfilter/nf_tables.h     |  2 --
 net/netfilter/nf_tables_api.c         | 10 +++++-----
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index b37a7d608134..158514281a75 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -24,6 +24,7 @@ struct nf_flowtable_type {
 struct nf_flowtable {
 	struct list_head		list;
 	struct rhashtable		rhashtable;
+	int				priority;
 	const struct nf_flowtable_type	*type;
 	struct delayed_work		gc_work;
 };
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 001d294edf57..d529dfb5aa64 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1155,7 +1155,6 @@ void nft_unregister_obj(struct nft_object_type *obj_type);
  * 	@table: the table the flow table is contained in
  *	@name: name of this flow table
  *	@hooknum: hook number
- *	@priority: hook priority
  *	@ops_len: number of hooks in array
  *	@genmask: generation mask
  *	@use: number of references to this flow table
@@ -1169,7 +1168,6 @@ struct nft_flowtable {
 	struct nft_table		*table;
 	char				*name;
 	int				hooknum;
-	int				priority;
 	int				ops_len;
 	u32				genmask:2,
 					use:30;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d481f9baca2f..bfea0d6effc5 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5706,10 +5706,10 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
 	if (!ops)
 		return -ENOMEM;
 
-	flowtable->hooknum	= hooknum;
-	flowtable->priority	= priority;
-	flowtable->ops		= ops;
-	flowtable->ops_len	= n;
+	flowtable->hooknum		= hooknum;
+	flowtable->data.priority	= priority;
+	flowtable->ops			= ops;
+	flowtable->ops_len		= n;
 
 	for (i = 0; i < n; i++) {
 		flowtable->ops[i].pf		= NFPROTO_NETDEV;
@@ -5969,7 +5969,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
 	if (!nest)
 		goto nla_put_failure;
 	if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) ||
-	    nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->priority)))
+	    nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->data.priority)))
 		goto nla_put_failure;
 
 	nest_devs = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK_DEVS);
-- 
cgit v1.2.3-59-g8ed1b


From 3f0465a9ef02624e0a36db9e7c9bedcafcd6f6fe Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 16 Oct 2019 14:24:01 +0200
Subject: netfilter: nf_tables: dynamically allocate hooks per net_device in
 flowtables

Use a list of hooks per device instead an array.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |   8 +-
 net/netfilter/nf_tables_api.c     | 253 +++++++++++++++++++++++---------------
 2 files changed, 158 insertions(+), 103 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index d529dfb5aa64..7a2ac82ee0ad 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -963,6 +963,12 @@ struct nft_stats {
 	struct u64_stats_sync	syncp;
 };
 
+struct nft_hook {
+	struct list_head	list;
+	struct nf_hook_ops	ops;
+	struct rcu_head		rcu;
+};
+
 /**
  *	struct nft_base_chain - nf_tables base chain
  *
@@ -1173,7 +1179,7 @@ struct nft_flowtable {
 					use:30;
 	u64				handle;
 	/* runtime data below here */
-	struct nf_hook_ops		*ops ____cacheline_aligned;
+	struct list_head		hook_list ____cacheline_aligned;
 	struct nf_flowtable		data;
 };
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index bfea0d6effc5..d6224c7b0e28 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1508,6 +1508,76 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
 	}
 }
 
+static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
+					      const struct nlattr *attr)
+{
+	struct net_device *dev;
+	char ifname[IFNAMSIZ];
+	struct nft_hook *hook;
+	int err;
+
+	hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL);
+	if (!hook) {
+		err = -ENOMEM;
+		goto err_hook_alloc;
+	}
+
+	nla_strlcpy(ifname, attr, IFNAMSIZ);
+	dev = __dev_get_by_name(net, ifname);
+	if (!dev) {
+		err = -ENOENT;
+		goto err_hook_dev;
+	}
+	hook->ops.dev = dev;
+
+	return hook;
+
+err_hook_dev:
+	kfree(hook);
+err_hook_alloc:
+	return ERR_PTR(err);
+}
+
+static int nf_tables_parse_netdev_hooks(struct net *net,
+					const struct nlattr *attr,
+					struct list_head *hook_list)
+{
+	struct nft_hook *hook, *next;
+	const struct nlattr *tmp;
+	int rem, n = 0, err;
+
+	nla_for_each_nested(tmp, attr, rem) {
+		if (nla_type(tmp) != NFTA_DEVICE_NAME) {
+			err = -EINVAL;
+			goto err_hook;
+		}
+
+		hook = nft_netdev_hook_alloc(net, tmp);
+		if (IS_ERR(hook)) {
+			err = PTR_ERR(hook);
+			goto err_hook;
+		}
+		list_add_tail(&hook->list, hook_list);
+		n++;
+
+		if (n == NFT_FLOWTABLE_DEVICE_MAX) {
+			err = -EFBIG;
+			goto err_hook;
+		}
+	}
+	if (!n)
+		return -EINVAL;
+
+	return 0;
+
+err_hook:
+	list_for_each_entry_safe(hook, next, hook_list, list) {
+		list_del(&hook->list);
+		kfree(hook);
+	}
+	return err;
+}
+
 struct nft_chain_hook {
 	u32				num;
 	s32				priority;
@@ -5628,43 +5698,6 @@ nft_flowtable_lookup_byhandle(const struct nft_table *table,
        return ERR_PTR(-ENOENT);
 }
 
-static int nf_tables_parse_devices(const struct nft_ctx *ctx,
-				   const struct nlattr *attr,
-				   struct net_device *dev_array[], int *len)
-{
-	const struct nlattr *tmp;
-	struct net_device *dev;
-	char ifname[IFNAMSIZ];
-	int rem, n = 0, err;
-
-	nla_for_each_nested(tmp, attr, rem) {
-		if (nla_type(tmp) != NFTA_DEVICE_NAME) {
-			err = -EINVAL;
-			goto err1;
-		}
-
-		nla_strlcpy(ifname, tmp, IFNAMSIZ);
-		dev = __dev_get_by_name(ctx->net, ifname);
-		if (!dev) {
-			err = -ENOENT;
-			goto err1;
-		}
-
-		dev_array[n++] = dev;
-		if (n == NFT_FLOWTABLE_DEVICE_MAX) {
-			err = -EFBIG;
-			goto err1;
-		}
-	}
-	if (!len)
-		return -EINVAL;
-
-	err = 0;
-err1:
-	*len = n;
-	return err;
-}
-
 static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = {
 	[NFTA_FLOWTABLE_HOOK_NUM]	= { .type = NLA_U32 },
 	[NFTA_FLOWTABLE_HOOK_PRIORITY]	= { .type = NLA_U32 },
@@ -5675,11 +5708,10 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
 					  const struct nlattr *attr,
 					  struct nft_flowtable *flowtable)
 {
-	struct net_device *dev_array[NFT_FLOWTABLE_DEVICE_MAX];
 	struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
-	struct nf_hook_ops *ops;
+	struct nft_hook *hook;
 	int hooknum, priority;
-	int err, n = 0, i;
+	int err;
 
 	err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr,
 					  nft_flowtable_hook_policy, NULL);
@@ -5697,27 +5729,21 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
 
 	priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
 
-	err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS],
-				      dev_array, &n);
+	err = nf_tables_parse_netdev_hooks(ctx->net,
+					   tb[NFTA_FLOWTABLE_HOOK_DEVS],
+					   &flowtable->hook_list);
 	if (err < 0)
 		return err;
 
-	ops = kcalloc(n, sizeof(struct nf_hook_ops), GFP_KERNEL);
-	if (!ops)
-		return -ENOMEM;
-
 	flowtable->hooknum		= hooknum;
 	flowtable->data.priority	= priority;
-	flowtable->ops			= ops;
-	flowtable->ops_len		= n;
 
-	for (i = 0; i < n; i++) {
-		flowtable->ops[i].pf		= NFPROTO_NETDEV;
-		flowtable->ops[i].hooknum	= hooknum;
-		flowtable->ops[i].priority	= priority;
-		flowtable->ops[i].priv		= &flowtable->data;
-		flowtable->ops[i].hook		= flowtable->data.type->hook;
-		flowtable->ops[i].dev		= dev_array[i];
+	list_for_each_entry(hook, &flowtable->hook_list, list) {
+		hook->ops.pf		= NFPROTO_NETDEV;
+		hook->ops.hooknum	= hooknum;
+		hook->ops.priority	= priority;
+		hook->ops.priv		= &flowtable->data;
+		hook->ops.hook		= flowtable->data.type->hook;
 	}
 
 	return err;
@@ -5757,14 +5783,51 @@ nft_flowtable_type_get(struct net *net, u8 family)
 static void nft_unregister_flowtable_net_hooks(struct net *net,
 					       struct nft_flowtable *flowtable)
 {
-	int i;
+	struct nft_hook *hook;
 
-	for (i = 0; i < flowtable->ops_len; i++) {
-		if (!flowtable->ops[i].dev)
-			continue;
+	list_for_each_entry(hook, &flowtable->hook_list, list)
+		nf_unregister_net_hook(net, &hook->ops);
+}
+
+static int nft_register_flowtable_net_hooks(struct net *net,
+					    struct nft_table *table,
+					    struct nft_flowtable *flowtable)
+{
+	struct nft_hook *hook, *hook2, *next;
+	struct nft_flowtable *ft;
+	int err, i = 0;
+
+	list_for_each_entry(hook, &flowtable->hook_list, list) {
+		list_for_each_entry(ft, &table->flowtables, list) {
+			list_for_each_entry(hook2, &ft->hook_list, list) {
+				if (hook->ops.dev == hook2->ops.dev &&
+				    hook->ops.pf == hook2->ops.pf) {
+					err = -EBUSY;
+					goto err_unregister_net_hooks;
+				}
+			}
+		}
 
-		nf_unregister_net_hook(net, &flowtable->ops[i]);
+		err = nf_register_net_hook(net, &hook->ops);
+		if (err < 0)
+			goto err_unregister_net_hooks;
+
+		i++;
 	}
+
+	return 0;
+
+err_unregister_net_hooks:
+	list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
+		if (i-- <= 0)
+			break;
+
+		nf_unregister_net_hook(net, &hook->ops);
+		list_del_rcu(&hook->list);
+		kfree_rcu(hook, rcu);
+	}
+
+	return err;
 }
 
 static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
@@ -5775,12 +5838,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	const struct nf_flowtable_type *type;
-	struct nft_flowtable *flowtable, *ft;
 	u8 genmask = nft_genmask_next(net);
 	int family = nfmsg->nfgen_family;
+	struct nft_flowtable *flowtable;
+	struct nft_hook *hook, *next;
 	struct nft_table *table;
 	struct nft_ctx ctx;
-	int err, i, k;
+	int err;
 
 	if (!nla[NFTA_FLOWTABLE_TABLE] ||
 	    !nla[NFTA_FLOWTABLE_NAME] ||
@@ -5819,6 +5883,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 
 	flowtable->table = table;
 	flowtable->handle = nf_tables_alloc_handle(table);
+	INIT_LIST_HEAD(&flowtable->hook_list);
 
 	flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL);
 	if (!flowtable->name) {
@@ -5842,43 +5907,24 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 	if (err < 0)
 		goto err4;
 
-	for (i = 0; i < flowtable->ops_len; i++) {
-		if (!flowtable->ops[i].dev)
-			continue;
-
-		list_for_each_entry(ft, &table->flowtables, list) {
-			for (k = 0; k < ft->ops_len; k++) {
-				if (!ft->ops[k].dev)
-					continue;
-
-				if (flowtable->ops[i].dev == ft->ops[k].dev &&
-				    flowtable->ops[i].pf == ft->ops[k].pf) {
-					err = -EBUSY;
-					goto err5;
-				}
-			}
-		}
-
-		err = nf_register_net_hook(net, &flowtable->ops[i]);
-		if (err < 0)
-			goto err5;
-	}
+	err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable);
+	if (err < 0)
+		goto err4;
 
 	err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
 	if (err < 0)
-		goto err6;
+		goto err5;
 
 	list_add_tail_rcu(&flowtable->list, &table->flowtables);
 	table->use++;
 
 	return 0;
-err6:
-	i = flowtable->ops_len;
 err5:
-	for (k = i - 1; k >= 0; k--)
-		nf_unregister_net_hook(net, &flowtable->ops[k]);
-
-	kfree(flowtable->ops);
+	list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
+		nf_unregister_net_hook(net, &hook->ops);
+		list_del_rcu(&hook->list);
+		kfree_rcu(hook, rcu);
+	}
 err4:
 	flowtable->data.type->free(&flowtable->data);
 err3:
@@ -5945,8 +5991,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
 {
 	struct nlattr *nest, *nest_devs;
 	struct nfgenmsg *nfmsg;
+	struct nft_hook *hook;
 	struct nlmsghdr *nlh;
-	int i;
 
 	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
@@ -5976,11 +6022,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
 	if (!nest_devs)
 		goto nla_put_failure;
 
-	for (i = 0; i < flowtable->ops_len; i++) {
-		const struct net_device *dev = READ_ONCE(flowtable->ops[i].dev);
-
-		if (dev &&
-		    nla_put_string(skb, NFTA_DEVICE_NAME, dev->name))
+	list_for_each_entry_rcu(hook, &flowtable->hook_list, list) {
+		if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name))
 			goto nla_put_failure;
 	}
 	nla_nest_end(skb, nest_devs);
@@ -6171,7 +6214,12 @@ err:
 
 static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
 {
-	kfree(flowtable->ops);
+	struct nft_hook *hook, *next;
+
+	list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
+		list_del_rcu(&hook->list);
+		kfree(hook);
+	}
 	kfree(flowtable->name);
 	flowtable->data.type->free(&flowtable->data);
 	module_put(flowtable->data.type->owner);
@@ -6211,14 +6259,15 @@ nla_put_failure:
 static void nft_flowtable_event(unsigned long event, struct net_device *dev,
 				struct nft_flowtable *flowtable)
 {
-	int i;
+	struct nft_hook *hook;
 
-	for (i = 0; i < flowtable->ops_len; i++) {
-		if (flowtable->ops[i].dev != dev)
+	list_for_each_entry(hook, &flowtable->hook_list, list) {
+		if (hook->ops.dev != dev)
 			continue;
 
-		nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]);
-		flowtable->ops[i].dev = NULL;
+		nf_unregister_net_hook(dev_net(dev), &hook->ops);
+		list_del_rcu(&hook->list);
+		kfree_rcu(hook, rcu);
 		break;
 	}
 }
-- 
cgit v1.2.3-59-g8ed1b


From cb662ac6711f7135618526221498ebfae155531a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 16 Oct 2019 14:29:47 +0200
Subject: netfilter: nf_tables: increase maximum devices number per flowtable

Rise the maximum limit of devices per flowtable up to 256. Rename
NFT_FLOWTABLE_DEVICE_MAX to NFT_NETDEVICE_MAX in preparation to reuse
the netdev hook parser for ingress basechain.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 2 +-
 net/netfilter/nf_tables_api.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 7a2ac82ee0ad..3d71070e747a 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1152,7 +1152,7 @@ struct nft_object_ops {
 int nft_register_obj(struct nft_object_type *obj_type);
 void nft_unregister_obj(struct nft_object_type *obj_type);
 
-#define NFT_FLOWTABLE_DEVICE_MAX	8
+#define NFT_NETDEVICE_MAX	256
 
 /**
  *	struct nft_flowtable - nf_tables flow table
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 2664bc388db4..98169af56c0f 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1577,7 +1577,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net,
 		list_add_tail(&hook->list, hook_list);
 		n++;
 
-		if (n == NFT_FLOWTABLE_DEVICE_MAX) {
+		if (n == NFT_NETDEVICE_MAX) {
 			err = -EFBIG;
 			goto err_hook;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From d54725cd11a57c30f650260cfb0a92c268bdc3e0 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 16 Oct 2019 14:30:05 +0200
Subject: netfilter: nf_tables: support for multiple devices per netdev hook

This patch allows you to register one netdev basechain to multiple
devices. This adds a new NFTA_HOOK_DEVS netlink attribute to specify
the list of netdevices. Basechains store a list of hooks.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |   4 +-
 include/uapi/linux/netfilter/nf_tables.h |   2 +
 net/netfilter/nf_tables_api.c            | 296 ++++++++++++++++++++++++-------
 net/netfilter/nf_tables_offload.c        |  44 +++--
 net/netfilter/nft_chain_filter.c         |  45 +++--
 5 files changed, 293 insertions(+), 98 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 3d71070e747a..5bf569e1173b 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -973,21 +973,21 @@ struct nft_hook {
  *	struct nft_base_chain - nf_tables base chain
  *
  *	@ops: netfilter hook ops
+ *	@hook_list: list of netfilter hooks (for NFPROTO_NETDEV family)
  *	@type: chain type
  *	@policy: default policy
  *	@stats: per-cpu chain stats
  *	@chain: the chain
- *	@dev_name: device name that this base chain is attached to (if any)
  *	@flow_block: flow block (for hardware offload)
  */
 struct nft_base_chain {
 	struct nf_hook_ops		ops;
+	struct list_head		hook_list;
 	const struct nft_chain_type	*type;
 	u8				policy;
 	u8				flags;
 	struct nft_stats __percpu	*stats;
 	struct nft_chain		chain;
-	char 				dev_name[IFNAMSIZ];
 	struct flow_block		flow_block;
 };
 
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index ed8881ad18ed..81fed16fe2b2 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -144,12 +144,14 @@ enum nft_list_attributes {
  * @NFTA_HOOK_HOOKNUM: netfilter hook number (NLA_U32)
  * @NFTA_HOOK_PRIORITY: netfilter hook priority (NLA_U32)
  * @NFTA_HOOK_DEV: netdevice name (NLA_STRING)
+ * @NFTA_HOOK_DEVS: list of netdevices (NLA_NESTED)
  */
 enum nft_hook_attributes {
 	NFTA_HOOK_UNSPEC,
 	NFTA_HOOK_HOOKNUM,
 	NFTA_HOOK_PRIORITY,
 	NFTA_HOOK_DEV,
+	NFTA_HOOK_DEVS,
 	__NFTA_HOOK_MAX
 };
 #define NFTA_HOOK_MAX		(__NFTA_HOOK_MAX - 1)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 98169af56c0f..13f09412cc6a 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -151,11 +151,64 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
 	}
 }
 
+static int nft_netdev_register_hooks(struct net *net,
+				     struct list_head *hook_list)
+{
+	struct nft_hook *hook;
+	int err, j;
+
+	j = 0;
+	list_for_each_entry(hook, hook_list, list) {
+		err = nf_register_net_hook(net, &hook->ops);
+		if (err < 0)
+			goto err_register;
+
+		j++;
+	}
+	return 0;
+
+err_register:
+	list_for_each_entry(hook, hook_list, list) {
+		if (j-- <= 0)
+			break;
+
+		nf_unregister_net_hook(net, &hook->ops);
+	}
+	return err;
+}
+
+static void nft_netdev_unregister_hooks(struct net *net,
+					struct list_head *hook_list)
+{
+	struct nft_hook *hook;
+
+	list_for_each_entry(hook, hook_list, list)
+		nf_unregister_net_hook(net, &hook->ops);
+}
+
+static int nft_register_basechain_hooks(struct net *net, int family,
+					struct nft_base_chain *basechain)
+{
+	if (family == NFPROTO_NETDEV)
+		return nft_netdev_register_hooks(net, &basechain->hook_list);
+
+	return nf_register_net_hook(net, &basechain->ops);
+}
+
+static void nft_unregister_basechain_hooks(struct net *net, int family,
+					   struct nft_base_chain *basechain)
+{
+	if (family == NFPROTO_NETDEV)
+		nft_netdev_unregister_hooks(net, &basechain->hook_list);
+	else
+		nf_unregister_net_hook(net, &basechain->ops);
+}
+
 static int nf_tables_register_hook(struct net *net,
 				   const struct nft_table *table,
 				   struct nft_chain *chain)
 {
-	const struct nft_base_chain *basechain;
+	struct nft_base_chain *basechain;
 	const struct nf_hook_ops *ops;
 
 	if (table->flags & NFT_TABLE_F_DORMANT ||
@@ -168,14 +221,14 @@ static int nf_tables_register_hook(struct net *net,
 	if (basechain->type->ops_register)
 		return basechain->type->ops_register(net, ops);
 
-	return nf_register_net_hook(net, ops);
+	return nft_register_basechain_hooks(net, table->family, basechain);
 }
 
 static void nf_tables_unregister_hook(struct net *net,
 				      const struct nft_table *table,
 				      struct nft_chain *chain)
 {
-	const struct nft_base_chain *basechain;
+	struct nft_base_chain *basechain;
 	const struct nf_hook_ops *ops;
 
 	if (table->flags & NFT_TABLE_F_DORMANT ||
@@ -187,7 +240,7 @@ static void nf_tables_unregister_hook(struct net *net,
 	if (basechain->type->ops_unregister)
 		return basechain->type->ops_unregister(net, ops);
 
-	nf_unregister_net_hook(net, ops);
+	nft_unregister_basechain_hooks(net, table->family, basechain);
 }
 
 static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
@@ -742,7 +795,8 @@ static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt)
 		if (cnt && i++ == cnt)
 			break;
 
-		nf_unregister_net_hook(net, &nft_base_chain(chain)->ops);
+		nft_unregister_basechain_hooks(net, table->family,
+					       nft_base_chain(chain));
 	}
 }
 
@@ -757,14 +811,16 @@ static int nf_tables_table_enable(struct net *net, struct nft_table *table)
 		if (!nft_is_base_chain(chain))
 			continue;
 
-		err = nf_register_net_hook(net, &nft_base_chain(chain)->ops);
+		err = nft_register_basechain_hooks(net, table->family,
+						   nft_base_chain(chain));
 		if (err < 0)
-			goto err;
+			goto err_register_hooks;
 
 		i++;
 	}
 	return 0;
-err:
+
+err_register_hooks:
 	if (i)
 		nft_table_disable(net, table, i);
 	return err;
@@ -1225,6 +1281,46 @@ nla_put_failure:
 	return -ENOSPC;
 }
 
+static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
+				   const struct nft_base_chain *basechain)
+{
+	const struct nf_hook_ops *ops = &basechain->ops;
+	struct nft_hook *hook, *first = NULL;
+	struct nlattr *nest, *nest_devs;
+	int n = 0;
+
+	nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK);
+	if (nest == NULL)
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
+		goto nla_put_failure;
+
+	if (family == NFPROTO_NETDEV) {
+		nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS);
+		list_for_each_entry(hook, &basechain->hook_list, list) {
+			if (!first)
+				first = hook;
+
+			if (nla_put_string(skb, NFTA_DEVICE_NAME,
+					   hook->ops.dev->name))
+				goto nla_put_failure;
+			n++;
+		}
+		nla_nest_end(skb, nest_devs);
+
+		if (n == 1 &&
+		    nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name))
+			goto nla_put_failure;
+	}
+	nla_nest_end(skb, nest);
+
+	return 0;
+nla_put_failure:
+	return -1;
+}
+
 static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
 				     u32 portid, u32 seq, int event, u32 flags,
 				     int family, const struct nft_table *table,
@@ -1253,21 +1349,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
 
 	if (nft_is_base_chain(chain)) {
 		const struct nft_base_chain *basechain = nft_base_chain(chain);
-		const struct nf_hook_ops *ops = &basechain->ops;
 		struct nft_stats __percpu *stats;
-		struct nlattr *nest;
 
-		nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK);
-		if (nest == NULL)
+		if (nft_dump_basechain_hook(skb, family, basechain))
 			goto nla_put_failure;
-		if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
-			goto nla_put_failure;
-		if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
-			goto nla_put_failure;
-		if (basechain->dev_name[0] &&
-		    nla_put_string(skb, NFTA_HOOK_DEV, basechain->dev_name))
-			goto nla_put_failure;
-		nla_nest_end(skb, nest);
 
 		if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
 				 htonl(basechain->policy)))
@@ -1485,6 +1570,7 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
 static void nf_tables_chain_destroy(struct nft_ctx *ctx)
 {
 	struct nft_chain *chain = ctx->chain;
+	struct nft_hook *hook, *next;
 
 	if (WARN_ON(chain->use > 0))
 		return;
@@ -1495,6 +1581,13 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx)
 	if (nft_is_base_chain(chain)) {
 		struct nft_base_chain *basechain = nft_base_chain(chain);
 
+		if (ctx->family == NFPROTO_NETDEV) {
+			list_for_each_entry_safe(hook, next,
+						 &basechain->hook_list, list) {
+				list_del_rcu(&hook->list);
+				kfree_rcu(hook, rcu);
+			}
+		}
 		module_put(basechain->type->owner);
 		if (rcu_access_pointer(basechain->stats)) {
 			static_branch_dec(&nft_counters_enabled);
@@ -1599,9 +1692,34 @@ struct nft_chain_hook {
 	u32				num;
 	s32				priority;
 	const struct nft_chain_type	*type;
-	struct net_device		*dev;
+	struct list_head		list;
 };
 
+static int nft_chain_parse_netdev(struct net *net,
+				  struct nlattr *tb[],
+				  struct list_head *hook_list)
+{
+	struct nft_hook *hook;
+	int err;
+
+	if (tb[NFTA_HOOK_DEV]) {
+		hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]);
+		if (IS_ERR(hook))
+			return PTR_ERR(hook);
+
+		list_add_tail(&hook->list, hook_list);
+	} else if (tb[NFTA_HOOK_DEVS]) {
+		err = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS],
+						   hook_list);
+		if (err < 0)
+			return err;
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int nft_chain_parse_hook(struct net *net,
 				const struct nlattr * const nla[],
 				struct nft_chain_hook *hook, u8 family,
@@ -1609,7 +1727,6 @@ static int nft_chain_parse_hook(struct net *net,
 {
 	struct nlattr *ha[NFTA_HOOK_MAX + 1];
 	const struct nft_chain_type *type;
-	struct net_device *dev;
 	int err;
 
 	lockdep_assert_held(&net->nft.commit_mutex);
@@ -1647,23 +1764,14 @@ static int nft_chain_parse_hook(struct net *net,
 
 	hook->type = type;
 
-	hook->dev = NULL;
+	INIT_LIST_HEAD(&hook->list);
 	if (family == NFPROTO_NETDEV) {
-		char ifname[IFNAMSIZ];
-
-		if (!ha[NFTA_HOOK_DEV]) {
-			module_put(type->owner);
-			return -EOPNOTSUPP;
-		}
-
-		nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ);
-		dev = __dev_get_by_name(net, ifname);
-		if (!dev) {
+		err = nft_chain_parse_netdev(net, ha, &hook->list);
+		if (err < 0) {
 			module_put(type->owner);
-			return -ENOENT;
+			return err;
 		}
-		hook->dev = dev;
-	} else if (ha[NFTA_HOOK_DEV]) {
+	} else if (ha[NFTA_HOOK_DEV] || ha[NFTA_HOOK_DEVS]) {
 		module_put(type->owner);
 		return -EOPNOTSUPP;
 	}
@@ -1673,6 +1781,12 @@ static int nft_chain_parse_hook(struct net *net,
 
 static void nft_chain_release_hook(struct nft_chain_hook *hook)
 {
+	struct nft_hook *h, *next;
+
+	list_for_each_entry_safe(h, next, &hook->list, list) {
+		list_del(&h->list);
+		kfree(h);
+	}
 	module_put(hook->type->owner);
 }
 
@@ -1697,6 +1811,49 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha
 	return kvmalloc(alloc, GFP_KERNEL);
 }
 
+static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family,
+				    const struct nft_chain_hook *hook,
+				    struct nft_chain *chain)
+{
+	ops->pf		= family;
+	ops->hooknum	= hook->num;
+	ops->priority	= hook->priority;
+	ops->priv	= chain;
+	ops->hook	= hook->type->hooks[ops->hooknum];
+}
+
+static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
+			      struct nft_chain_hook *hook, u32 flags)
+{
+	struct nft_chain *chain;
+	struct nft_hook *h;
+
+	basechain->type = hook->type;
+	INIT_LIST_HEAD(&basechain->hook_list);
+	chain = &basechain->chain;
+
+	if (family == NFPROTO_NETDEV) {
+		list_splice_init(&hook->list, &basechain->hook_list);
+		list_for_each_entry(h, &basechain->hook_list, list)
+			nft_basechain_hook_init(&h->ops, family, hook, chain);
+
+		basechain->ops.hooknum	= hook->num;
+		basechain->ops.priority	= hook->priority;
+	} else {
+		nft_basechain_hook_init(&basechain->ops, family, hook, chain);
+	}
+
+	chain->flags |= NFT_BASE_CHAIN | flags;
+	basechain->policy = NF_ACCEPT;
+	if (chain->flags & NFT_CHAIN_HW_OFFLOAD &&
+	    nft_chain_offload_priority(basechain) < 0)
+		return -EOPNOTSUPP;
+
+	flow_block_init(&basechain->flow_block);
+
+	return 0;
+}
+
 static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 			      u8 policy, u32 flags)
 {
@@ -1715,7 +1872,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 
 	if (nla[NFTA_CHAIN_HOOK]) {
 		struct nft_chain_hook hook;
-		struct nf_hook_ops *ops;
 
 		err = nft_chain_parse_hook(net, nla, &hook, family, true);
 		if (err < 0)
@@ -1726,9 +1882,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 			nft_chain_release_hook(&hook);
 			return -ENOMEM;
 		}
-
-		if (hook.dev != NULL)
-			strncpy(basechain->dev_name, hook.dev->name, IFNAMSIZ);
+		chain = &basechain->chain;
 
 		if (nla[NFTA_CHAIN_COUNTERS]) {
 			stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
@@ -1741,24 +1895,12 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 			static_branch_inc(&nft_counters_enabled);
 		}
 
-		basechain->type = hook.type;
-		chain = &basechain->chain;
-
-		ops		= &basechain->ops;
-		ops->pf		= family;
-		ops->hooknum	= hook.num;
-		ops->priority	= hook.priority;
-		ops->priv	= chain;
-		ops->hook	= hook.type->hooks[ops->hooknum];
-		ops->dev	= hook.dev;
-
-		chain->flags |= NFT_BASE_CHAIN | flags;
-		basechain->policy = NF_ACCEPT;
-		if (chain->flags & NFT_CHAIN_HW_OFFLOAD &&
-		    nft_chain_offload_priority(basechain) < 0)
-			return -EOPNOTSUPP;
-
-		flow_block_init(&basechain->flow_block);
+		err = nft_basechain_init(basechain, family, &hook, flags);
+		if (err < 0) {
+			nft_chain_release_hook(&hook);
+			kfree(basechain);
+			return err;
+		}
 	} else {
 		chain = kzalloc(sizeof(*chain), GFP_KERNEL);
 		if (chain == NULL)
@@ -1818,6 +1960,25 @@ err1:
 	return err;
 }
 
+static bool nft_hook_list_equal(struct list_head *hook_list1,
+				struct list_head *hook_list2)
+{
+	struct nft_hook *hook;
+	int n = 0, m = 0;
+
+	n = 0;
+	list_for_each_entry(hook, hook_list2, list) {
+		if (!nft_hook_list_find(hook_list1, hook))
+			return false;
+
+		n++;
+	}
+	list_for_each_entry(hook, hook_list1, list)
+		m++;
+
+	return n == m;
+}
+
 static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 			      u32 flags)
 {
@@ -1849,12 +2010,19 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 			return -EBUSY;
 		}
 
-		ops = &basechain->ops;
-		if (ops->hooknum != hook.num ||
-		    ops->priority != hook.priority ||
-		    ops->dev != hook.dev) {
-			nft_chain_release_hook(&hook);
-			return -EBUSY;
+		if (ctx->family == NFPROTO_NETDEV) {
+			if (!nft_hook_list_equal(&basechain->hook_list,
+						 &hook.list)) {
+				nft_chain_release_hook(&hook);
+				return -EBUSY;
+			}
+		} else {
+			ops = &basechain->ops;
+			if (ops->hooknum != hook.num ||
+			    ops->priority != hook.priority) {
+				nft_chain_release_hook(&hook);
+				return -EBUSY;
+			}
 		}
 		nft_chain_release_hook(&hook);
 	}
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index e7f32a9dad63..beeb74f2b47d 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -317,38 +317,47 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *chain,
 #define FLOW_SETUP_BLOCK TC_SETUP_BLOCK
 
 static int nft_flow_block_chain(struct nft_base_chain *basechain,
-				struct net_device *dev,
+				const struct net_device *this_dev,
 				enum flow_block_command cmd)
 {
-	if (dev->netdev_ops->ndo_setup_tc)
-		return nft_block_offload_cmd(basechain, dev, cmd);
+	struct net_device *dev;
+	struct nft_hook *hook;
+	int err;
+
+	list_for_each_entry(hook, &basechain->hook_list, list) {
+		dev = hook->ops.dev;
+		if (this_dev && this_dev != dev)
+			continue;
 
-	return nft_indr_block_offload_cmd(basechain, dev, cmd);
+		if (dev->netdev_ops->ndo_setup_tc)
+			err = nft_block_offload_cmd(basechain, dev, cmd);
+		else
+			err = nft_indr_block_offload_cmd(basechain, dev, cmd);
+
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
 }
 
-static int nft_flow_offload_chain(struct nft_chain *chain,
-				  u8 *ppolicy,
+static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy,
 				  enum flow_block_command cmd)
 {
 	struct nft_base_chain *basechain;
-	struct net_device *dev;
 	u8 policy;
 
 	if (!nft_is_base_chain(chain))
 		return -EOPNOTSUPP;
 
 	basechain = nft_base_chain(chain);
-	dev = basechain->ops.dev;
-	if (!dev)
-		return -EOPNOTSUPP;
-
 	policy = ppolicy ? *ppolicy : basechain->policy;
 
 	/* Only default policy to accept is supported for now. */
 	if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP)
 		return -EOPNOTSUPP;
 
-	return nft_flow_block_chain(basechain, dev, cmd);
+	return nft_flow_block_chain(basechain, NULL, cmd);
 }
 
 int nft_flow_rule_offload_commit(struct net *net)
@@ -414,6 +423,7 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev)
 {
 	struct nft_base_chain *basechain;
 	struct net *net = dev_net(dev);
+	struct nft_hook *hook, *found;
 	const struct nft_table *table;
 	struct nft_chain *chain;
 
@@ -426,8 +436,16 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev)
 			    !(chain->flags & NFT_CHAIN_HW_OFFLOAD))
 				continue;
 
+			found = NULL;
 			basechain = nft_base_chain(chain);
-			if (strncmp(basechain->dev_name, dev->name, IFNAMSIZ))
+			list_for_each_entry(hook, &basechain->hook_list, list) {
+				if (hook->ops.dev != dev)
+					continue;
+
+				found = hook;
+				break;
+			}
+			if (!found)
 				continue;
 
 			return chain;
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index b5d5d071d765..c78d01bc02e9 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -287,28 +287,35 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev,
 			     struct nft_ctx *ctx)
 {
 	struct nft_base_chain *basechain = nft_base_chain(ctx->chain);
+	struct nft_hook *hook, *found = NULL;
+	int n = 0;
 
-	switch (event) {
-	case NETDEV_UNREGISTER:
-		if (strcmp(basechain->dev_name, dev->name) != 0)
-			return;
-
-		/* UNREGISTER events are also happpening on netns exit.
-		 *
-		 * Altough nf_tables core releases all tables/chains, only
-		 * this event handler provides guarantee that
-		 * basechain.ops->dev is still accessible, so we cannot
-		 * skip exiting net namespaces.
-		 */
-		__nft_release_basechain(ctx);
-		break;
-	case NETDEV_CHANGENAME:
-		if (dev->ifindex != basechain->ops.dev->ifindex)
-			return;
+	if (event != NETDEV_UNREGISTER)
+		return;
 
-		strncpy(basechain->dev_name, dev->name, IFNAMSIZ);
-		break;
+	list_for_each_entry(hook, &basechain->hook_list, list) {
+		if (hook->ops.dev == dev)
+			found = hook;
+
+		n++;
 	}
+	if (!found)
+		return;
+
+	if (n > 1) {
+		nf_unregister_net_hook(ctx->net, &found->ops);
+		list_del_rcu(&found->list);
+		kfree_rcu(found, rcu);
+		return;
+	}
+
+	/* UNREGISTER events are also happening on netns exit.
+	 *
+	 * Although nf_tables core releases all tables/chains, only this event
+	 * handler provides guarantee that hook->ops.dev is still accessible,
+	 * so we cannot skip exiting net namespaces.
+	 */
+	__nft_release_basechain(ctx);
 }
 
 static int nf_tables_netdev_event(struct notifier_block *this,
-- 
cgit v1.2.3-59-g8ed1b


From fa6e98cee558622565c97924e922b97340aeabd8 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 22 Oct 2019 11:31:07 -0700
Subject: net: phy: add support for clause 37 auto-negotiation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds support for clause 37 1000Base-X auto-negotiation.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Tao Ren <taoren@fb.com>
Tested-by: René van Dorst <opensource@vdorst.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 139 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h          |   4 ++
 2 files changed, 143 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index f1f60bd4865a..fa71998fea51 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1607,6 +1607,40 @@ static int genphy_config_advert(struct phy_device *phydev)
 	return changed;
 }
 
+/**
+ * genphy_c37_config_advert - sanitize and advertise auto-negotiation parameters
+ * @phydev: target phy_device struct
+ *
+ * Description: Writes MII_ADVERTISE with the appropriate values,
+ *   after sanitizing the values to make sure we only advertise
+ *   what is supported.  Returns < 0 on error, 0 if the PHY's advertisement
+ *   hasn't changed, and > 0 if it has changed. This function is intended
+ *   for Clause 37 1000Base-X mode.
+ */
+static int genphy_c37_config_advert(struct phy_device *phydev)
+{
+	u16 adv = 0;
+
+	/* Only allow advertising what this PHY supports */
+	linkmode_and(phydev->advertising, phydev->advertising,
+		     phydev->supported);
+
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT,
+			      phydev->advertising))
+		adv |= ADVERTISE_1000XFULL;
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT,
+			      phydev->advertising))
+		adv |= ADVERTISE_1000XPAUSE;
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+			      phydev->advertising))
+		adv |= ADVERTISE_1000XPSE_ASYM;
+
+	return phy_modify_changed(phydev, MII_ADVERTISE,
+				  ADVERTISE_1000XFULL | ADVERTISE_1000XPAUSE |
+				  ADVERTISE_1000XHALF | ADVERTISE_1000XPSE_ASYM,
+				  adv);
+}
+
 /**
  * genphy_config_eee_advert - disable unwanted eee mode advertisement
  * @phydev: target phy_device struct
@@ -1715,6 +1749,54 @@ int __genphy_config_aneg(struct phy_device *phydev, bool changed)
 }
 EXPORT_SYMBOL(__genphy_config_aneg);
 
+/**
+ * genphy_c37_config_aneg - restart auto-negotiation or write BMCR
+ * @phydev: target phy_device struct
+ *
+ * Description: If auto-negotiation is enabled, we configure the
+ *   advertising, and then restart auto-negotiation.  If it is not
+ *   enabled, then we write the BMCR. This function is intended
+ *   for use with Clause 37 1000Base-X mode.
+ */
+int genphy_c37_config_aneg(struct phy_device *phydev)
+{
+	int err, changed;
+
+	if (phydev->autoneg != AUTONEG_ENABLE)
+		return genphy_setup_forced(phydev);
+
+	err = phy_modify(phydev, MII_BMCR, BMCR_SPEED1000 | BMCR_SPEED100,
+			 BMCR_SPEED1000);
+	if (err)
+		return err;
+
+	changed = genphy_c37_config_advert(phydev);
+	if (changed < 0) /* error */
+		return changed;
+
+	if (!changed) {
+		/* Advertisement hasn't changed, but maybe aneg was never on to
+		 * begin with?  Or maybe phy was isolated?
+		 */
+		int ctl = phy_read(phydev, MII_BMCR);
+
+		if (ctl < 0)
+			return ctl;
+
+		if (!(ctl & BMCR_ANENABLE) || (ctl & BMCR_ISOLATE))
+			changed = 1; /* do restart aneg */
+	}
+
+	/* Only restart aneg if we are advertising something different
+	 * than we were before.
+	 */
+	if (changed > 0)
+		return genphy_restart_aneg(phydev);
+
+	return 0;
+}
+EXPORT_SYMBOL(genphy_c37_config_aneg);
+
 /**
  * genphy_aneg_done - return auto-negotiation status
  * @phydev: target phy_device struct
@@ -1886,6 +1968,63 @@ int genphy_read_status(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(genphy_read_status);
 
+/**
+ * genphy_c37_read_status - check the link status and update current link state
+ * @phydev: target phy_device struct
+ *
+ * Description: Check the link, then figure out the current state
+ *   by comparing what we advertise with what the link partner
+ *   advertises. This function is for Clause 37 1000Base-X mode.
+ */
+int genphy_c37_read_status(struct phy_device *phydev)
+{
+	int lpa, err, old_link = phydev->link;
+
+	/* Update the link, but return if there was an error */
+	err = genphy_update_link(phydev);
+	if (err)
+		return err;
+
+	/* why bother the PHY if nothing can have changed */
+	if (phydev->autoneg == AUTONEG_ENABLE && old_link && phydev->link)
+		return 0;
+
+	phydev->duplex = DUPLEX_UNKNOWN;
+	phydev->pause = 0;
+	phydev->asym_pause = 0;
+
+	if (phydev->autoneg == AUTONEG_ENABLE && phydev->autoneg_complete) {
+		lpa = phy_read(phydev, MII_LPA);
+		if (lpa < 0)
+			return lpa;
+
+		linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+				 phydev->lp_advertising, lpa & LPA_LPACK);
+		linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT,
+				 phydev->lp_advertising, lpa & LPA_1000XFULL);
+		linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT,
+				 phydev->lp_advertising, lpa & LPA_1000XPAUSE);
+		linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+				 phydev->lp_advertising,
+				 lpa & LPA_1000XPAUSE_ASYM);
+
+		phy_resolve_aneg_linkmode(phydev);
+	} else if (phydev->autoneg == AUTONEG_DISABLE) {
+		int bmcr = phy_read(phydev, MII_BMCR);
+
+		if (bmcr < 0)
+			return bmcr;
+
+		if (bmcr & BMCR_FULLDPLX)
+			phydev->duplex = DUPLEX_FULL;
+		else
+			phydev->duplex = DUPLEX_HALF;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(genphy_c37_read_status);
+
 /**
  * genphy_soft_reset - software reset the PHY via BMCR_RESET bit
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 9a0e981df502..78436d58ce7c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1106,6 +1106,10 @@ int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad,
 int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum,
 				 u16 regnum, u16 val);
 
+/* Clause 37 */
+int genphy_c37_config_aneg(struct phy_device *phydev);
+int genphy_c37_read_status(struct phy_device *phydev);
+
 /* Clause 45 PHY */
 int genphy_c45_restart_aneg(struct phy_device *phydev);
 int genphy_c45_check_and_restart_aneg(struct phy_device *phydev, bool restart);
-- 
cgit v1.2.3-59-g8ed1b


From b9bcb95315febd09419ab870ddc7cb98a393f9d0 Mon Sep 17 00:00:00 2001
From: Tao Ren <taoren@fb.com>
Date: Tue, 22 Oct 2019 11:31:08 -0700
Subject: net: phy: broadcom: add 1000Base-X support for BCM54616S

The BCM54616S PHY cannot work properly in RGMII->1000Base-X mode, mainly
because genphy functions are designed for copper links, and 1000Base-X
(clause 37) auto negotiation needs to be handled differently.

This patch enables 1000Base-X support for BCM54616S by customizing 3
driver callbacks, and it's verified to be working on Facebook CMM BMC
platform (RGMII->1000Base-KX):

  - probe: probe callback detects PHY's operation mode based on
    INTERF_SEL[1:0] pins and 1000X/100FX selection bit in SerDES 100-FX
    Control register.

  - config_aneg: calls genphy_c37_config_aneg when the PHY is running in
    1000Base-X mode; otherwise, genphy_config_aneg will be called.

  - read_status: calls genphy_c37_read_status when the PHY is running in
    1000Base-X mode; otherwise, genphy_read_status will be called.

Note: BCM54616S PHY can also be configured in RGMII->100Base-FX mode, and
100Base-FX support is not available as of now.

Signed-off-by: Tao Ren <taoren@fb.com>
Acked-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 57 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/brcmphy.h    | 10 ++++++--
 2 files changed, 61 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 4313c74b4fd8..7d68b28bb893 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -359,9 +359,9 @@ static int bcm5482_config_init(struct phy_device *phydev)
 		/*
 		 * Select 1000BASE-X register set (primary SerDes)
 		 */
-		reg = bcm_phy_read_shadow(phydev, BCM5482_SHD_MODE);
-		bcm_phy_write_shadow(phydev, BCM5482_SHD_MODE,
-				     reg | BCM5482_SHD_MODE_1000BX);
+		reg = bcm_phy_read_shadow(phydev, BCM54XX_SHD_MODE);
+		bcm_phy_write_shadow(phydev, BCM54XX_SHD_MODE,
+				     reg | BCM54XX_SHD_MODE_1000BX);
 
 		/*
 		 * LED1=ACTIVITYLED, LED3=LINKSPD[2]
@@ -427,12 +427,47 @@ static int bcm5481_config_aneg(struct phy_device *phydev)
 	return ret;
 }
 
+static int bcm54616s_probe(struct phy_device *phydev)
+{
+	int val, intf_sel;
+
+	val = bcm_phy_read_shadow(phydev, BCM54XX_SHD_MODE);
+	if (val < 0)
+		return val;
+
+	/* The PHY is strapped in RGMII-fiber mode when INTERF_SEL[1:0]
+	 * is 01b, and the link between PHY and its link partner can be
+	 * either 1000Base-X or 100Base-FX.
+	 * RGMII-1000Base-X is properly supported, but RGMII-100Base-FX
+	 * support is still missing as of now.
+	 */
+	intf_sel = (val & BCM54XX_SHD_INTF_SEL_MASK) >> 1;
+	if (intf_sel == 1) {
+		val = bcm_phy_read_shadow(phydev, BCM54616S_SHD_100FX_CTRL);
+		if (val < 0)
+			return val;
+
+		/* Bit 0 of the SerDes 100-FX Control register, when set
+		 * to 1, sets the MII/RGMII -> 100BASE-FX configuration.
+		 * When this bit is set to 0, it sets the GMII/RGMII ->
+		 * 1000BASE-X configuration.
+		 */
+		if (!(val & BCM54616S_100FX_MODE))
+			phydev->dev_flags |= PHY_BCM_FLAGS_MODE_1000BX;
+	}
+
+	return 0;
+}
+
 static int bcm54616s_config_aneg(struct phy_device *phydev)
 {
 	int ret;
 
 	/* Aneg firsly. */
-	ret = genphy_config_aneg(phydev);
+	if (phydev->dev_flags & PHY_BCM_FLAGS_MODE_1000BX)
+		ret = genphy_c37_config_aneg(phydev);
+	else
+		ret = genphy_config_aneg(phydev);
 
 	/* Then we can set up the delay. */
 	bcm54xx_config_clock_delay(phydev);
@@ -440,6 +475,18 @@ static int bcm54616s_config_aneg(struct phy_device *phydev)
 	return ret;
 }
 
+static int bcm54616s_read_status(struct phy_device *phydev)
+{
+	int err;
+
+	if (phydev->dev_flags & PHY_BCM_FLAGS_MODE_1000BX)
+		err = genphy_c37_read_status(phydev);
+	else
+		err = genphy_read_status(phydev);
+
+	return err;
+}
+
 static int brcm_phy_setbits(struct phy_device *phydev, int reg, int set)
 {
 	int val;
@@ -631,6 +678,8 @@ static struct phy_driver broadcom_drivers[] = {
 	.config_aneg	= bcm54616s_config_aneg,
 	.ack_interrupt	= bcm_phy_ack_intr,
 	.config_intr	= bcm_phy_config_intr,
+	.read_status	= bcm54616s_read_status,
+	.probe		= bcm54616s_probe,
 }, {
 	.phy_id		= PHY_ID_BCM5464,
 	.phy_id_mask	= 0xfffffff0,
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 6db2d9a6e503..b475e7f20d28 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -200,9 +200,15 @@
 #define BCM5482_SHD_SSD		0x14	/* 10100: Secondary SerDes control */
 #define BCM5482_SHD_SSD_LEDM	0x0008	/* SSD LED Mode enable */
 #define BCM5482_SHD_SSD_EN	0x0001	/* SSD enable */
-#define BCM5482_SHD_MODE	0x1f	/* 11111: Mode Control Register */
-#define BCM5482_SHD_MODE_1000BX	0x0001	/* Enable 1000BASE-X registers */
 
+/* 10011: SerDes 100-FX Control Register */
+#define BCM54616S_SHD_100FX_CTRL	0x13
+#define	BCM54616S_100FX_MODE		BIT(0)	/* 100-FX SerDes Enable */
+
+/* 11111: Mode Control Register */
+#define BCM54XX_SHD_MODE		0x1f
+#define BCM54XX_SHD_INTF_SEL_MASK	GENMASK(2, 1)	/* INTERF_SEL[1:0] */
+#define BCM54XX_SHD_MODE_1000BX		BIT(0)	/* Enable 1000-X registers */
 
 /*
  * EXPANSION SHADOW ACCESS REGISTERS.  (PHY REG 0x15, 0x16, and 0x17)
-- 
cgit v1.2.3-59-g8ed1b


From 3820729160440158a014add69cc0d371061a96b2 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 24 Oct 2019 17:18:11 -0700
Subject: bpf: Prepare btf_ctx_access for non raw_tp use case

This patch makes a few changes to btf_ctx_access() to prepare
it for non raw_tp use case where the attach_btf_id is not
necessary a BTF_KIND_TYPEDEF.

It moves the "btf_trace_" prefix check and typedef-follow logic to a new
function "check_attach_btf_id()" which is called only once during
bpf_check().  btf_ctx_access() only operates on a BTF_KIND_FUNC_PROTO
type now. That should also be more efficient since it is done only
one instead of every-time check_ctx_access() is called.

"check_attach_btf_id()" needs to find the func_proto type from
the attach_btf_id.  It needs to store the result into the
newly added prog->aux->attach_func_proto.  func_proto
btf type has no name, so a proper name should be stored into
"attach_func_name" also.

v2:
- Move the "btf_trace_" check to an earlier verifier phase (Alexei)

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20191025001811.1718491-1-kafai@fb.com
---
 include/linux/bpf.h      |  5 ++++
 include/linux/btf.h      | 31 ++++++++++++++++++++
 kernel/bpf/btf.c         | 73 +++++++++---------------------------------------
 kernel/bpf/syscall.c     |  4 +--
 kernel/bpf/verifier.c    | 52 +++++++++++++++++++++++++++++++++-
 kernel/trace/bpf_trace.c |  2 ++
 6 files changed, 103 insertions(+), 64 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2c2c29b49845..171be30fe0ae 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -392,6 +392,11 @@ struct bpf_prog_aux {
 	u32 attach_btf_id; /* in-kernel BTF type id to attach to */
 	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
 	bool offload_requested;
+	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
+	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
+	const struct btf_type *attach_func_proto;
+	/* function name for valid attach_btf_id */
+	const char *attach_func_name;
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
 	struct latch_tree_node ksym_tnode;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 55d43bc856be..9dee00859c5f 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -5,6 +5,7 @@
 #define _LINUX_BTF_H 1
 
 #include <linux/types.h>
+#include <uapi/linux/btf.h>
 
 struct btf;
 struct btf_member;
@@ -53,6 +54,36 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
 int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
 bool btf_type_is_void(const struct btf_type *t);
 
+static inline bool btf_type_is_ptr(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
+}
+
+static inline bool btf_type_is_int(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
+}
+
+static inline bool btf_type_is_enum(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM;
+}
+
+static inline bool btf_type_is_typedef(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF;
+}
+
+static inline bool btf_type_is_func(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC;
+}
+
+static inline bool btf_type_is_func_proto(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
 const char *btf_name_by_offset(const struct btf *btf, u32 offset);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f7557af39756..128d89601d73 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -336,16 +336,6 @@ static bool btf_type_is_fwd(const struct btf_type *t)
 	return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
 }
 
-static bool btf_type_is_func(const struct btf_type *t)
-{
-	return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC;
-}
-
-static bool btf_type_is_func_proto(const struct btf_type *t)
-{
-	return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
-}
-
 static bool btf_type_nosize(const struct btf_type *t)
 {
 	return btf_type_is_void(t) || btf_type_is_fwd(t) ||
@@ -377,16 +367,6 @@ static bool btf_type_is_array(const struct btf_type *t)
 	return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
 }
 
-static bool btf_type_is_ptr(const struct btf_type *t)
-{
-	return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
-}
-
-static bool btf_type_is_int(const struct btf_type *t)
-{
-	return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
-}
-
 static bool btf_type_is_var(const struct btf_type *t)
 {
 	return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;
@@ -3442,54 +3422,27 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info)
 {
+	const struct btf_type *t = prog->aux->attach_func_proto;
+	const char *tname = prog->aux->attach_func_name;
 	struct bpf_verifier_log *log = info->log;
-	u32 btf_id = prog->aux->attach_btf_id;
 	const struct btf_param *args;
-	const struct btf_type *t;
-	const char prefix[] = "btf_trace_";
-	const char *tname;
 	u32 nr_args, arg;
 
-	if (!btf_id)
-		return true;
-
-	if (IS_ERR(btf_vmlinux)) {
-		bpf_log(log, "btf_vmlinux is malformed\n");
-		return false;
-	}
-
-	t = btf_type_by_id(btf_vmlinux, btf_id);
-	if (!t || BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) {
-		bpf_log(log, "btf_id is invalid\n");
-		return false;
-	}
-
-	tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
-	if (strncmp(prefix, tname, sizeof(prefix) - 1)) {
-		bpf_log(log, "btf_id points to wrong type name %s\n", tname);
-		return false;
-	}
-	tname += sizeof(prefix) - 1;
-
-	t = btf_type_by_id(btf_vmlinux, t->type);
-	if (!btf_type_is_ptr(t))
-		return false;
-	t = btf_type_by_id(btf_vmlinux, t->type);
-	if (!btf_type_is_func_proto(t))
-		return false;
-
 	if (off % 8) {
-		bpf_log(log, "raw_tp '%s' offset %d is not multiple of 8\n",
+		bpf_log(log, "func '%s' offset %d is not multiple of 8\n",
 			tname, off);
 		return false;
 	}
 	arg = off / 8;
 	args = (const struct btf_param *)(t + 1);
-	/* skip first 'void *__data' argument in btf_trace_##name typedef */
-	args++;
-	nr_args = btf_type_vlen(t) - 1;
+	nr_args = btf_type_vlen(t);
+	if (prog->aux->attach_btf_trace) {
+		/* skip first 'void *__data' argument in btf_trace_##name typedef */
+		args++;
+		nr_args--;
+	}
 	if (arg >= nr_args) {
-		bpf_log(log, "raw_tp '%s' doesn't have %d-th argument\n",
+		bpf_log(log, "func '%s' doesn't have %d-th argument\n",
 			tname, arg);
 		return false;
 	}
@@ -3503,7 +3456,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		return true;
 	if (!btf_type_is_ptr(t)) {
 		bpf_log(log,
-			"raw_tp '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
+			"func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
 			tname, arg,
 			__btf_name_by_offset(btf_vmlinux, t->name_off),
 			btf_kind_str[BTF_INFO_KIND(t->info)]);
@@ -3526,11 +3479,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		t = btf_type_by_id(btf_vmlinux, t->type);
 	if (!btf_type_is_struct(t)) {
 		bpf_log(log,
-			"raw_tp '%s' arg%d type %s is not a struct\n",
+			"func '%s' arg%d type %s is not a struct\n",
 			tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]);
 		return false;
 	}
-	bpf_log(log, "raw_tp '%s' arg%d has btf_id %d type %s '%s'\n",
+	bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
 		tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)],
 		__btf_name_by_offset(btf_vmlinux, t->name_off));
 	return true;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 16ea3c0db4f6..ff5225759553 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1848,9 +1848,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 			goto out_put_prog;
 		}
 		/* raw_tp name is taken from type name instead */
-		tp_name = kernel_type_name(prog->aux->attach_btf_id);
-		/* skip the prefix */
-		tp_name += sizeof("btf_trace_") - 1;
+		tp_name = prog->aux->attach_func_name;
 	} else {
 		if (strncpy_from_user(buf,
 				      u64_to_user_ptr(attr->raw_tracepoint.name),
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 556e82f8869b..c59778c0fc4d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9372,6 +9372,52 @@ static void print_verification_stats(struct bpf_verifier_env *env)
 		env->peak_states, env->longest_mark_read_walk);
 }
 
+static int check_attach_btf_id(struct bpf_verifier_env *env)
+{
+	struct bpf_prog *prog = env->prog;
+	u32 btf_id = prog->aux->attach_btf_id;
+	const struct btf_type *t;
+	const char *tname;
+
+	if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT && btf_id) {
+		const char prefix[] = "btf_trace_";
+
+		t = btf_type_by_id(btf_vmlinux, btf_id);
+		if (!t) {
+			verbose(env, "attach_btf_id %u is invalid\n", btf_id);
+			return -EINVAL;
+		}
+		if (!btf_type_is_typedef(t)) {
+			verbose(env, "attach_btf_id %u is not a typedef\n",
+				btf_id);
+			return -EINVAL;
+		}
+		tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+		if (!tname || strncmp(prefix, tname, sizeof(prefix) - 1)) {
+			verbose(env, "attach_btf_id %u points to wrong type name %s\n",
+				btf_id, tname);
+			return -EINVAL;
+		}
+		tname += sizeof(prefix) - 1;
+		t = btf_type_by_id(btf_vmlinux, t->type);
+		if (!btf_type_is_ptr(t))
+			/* should never happen in valid vmlinux build */
+			return -EINVAL;
+		t = btf_type_by_id(btf_vmlinux, t->type);
+		if (!btf_type_is_func_proto(t))
+			/* should never happen in valid vmlinux build */
+			return -EINVAL;
+
+		/* remember two read only pointers that are valid for
+		 * the life time of the kernel
+		 */
+		prog->aux->attach_func_name = tname;
+		prog->aux->attach_func_proto = t;
+		prog->aux->attach_btf_trace = true;
+	}
+	return 0;
+}
+
 int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 	      union bpf_attr __user *uattr)
 {
@@ -9435,9 +9481,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 		/* Either gcc or pahole or kernel are broken. */
 		verbose(env, "in-kernel BTF is malformed\n");
 		ret = PTR_ERR(btf_vmlinux);
-		goto err_unlock;
+		goto skip_full_check;
 	}
 
+	ret = check_attach_btf_id(env);
+	if (ret)
+		goto skip_full_check;
+
 	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c3240898cc44..571c25d60710 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1080,6 +1080,8 @@ static bool raw_tp_prog_is_valid_access(int off, int size,
 		return false;
 	if (off % size != 0)
 		return false;
+	if (!prog->aux->attach_btf_id)
+		return true;
 	return btf_ctx_access(off, size, type, prog, info);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 480274787d7e3458bc5a7cfbbbe07033984ad711 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Wed, 23 Oct 2019 11:09:26 -0400
Subject: tcp: add TCP_INFO status for failed client TFO

The TCPI_OPT_SYN_DATA bit as part of tcpi_options currently reports whether
or not data-in-SYN was ack'd on both the client and server side. We'd like
to gather more information on the client-side in the failure case in order
to indicate the reason for the failure. This can be useful for not only
debugging TFO, but also for creating TFO socket policies. For example, if
a middle box removes the TFO option or drops a data-in-SYN, we can
can detect this case, and turn off TFO for these connections saving the
extra retransmits.

The newly added tcpi_fastopen_client_fail status is 2 bits and has the
following 4 states:

1) TFO_STATUS_UNSPEC

Catch-all state which includes when TFO is disabled via black hole
detection, which is indicated via LINUX_MIB_TCPFASTOPENBLACKHOLE.

2) TFO_COOKIE_UNAVAILABLE

If TFO_CLIENT_NO_COOKIE mode is off, this state indicates that no cookie
is available in the cache.

3) TFO_DATA_NOT_ACKED

Data was sent with SYN, we received a SYN/ACK but it did not cover the data
portion. Cookie is not accepted by server because the cookie may be invalid
or the server may be overloaded.

4) TFO_SYN_RETRANSMITTED

Data was sent with SYN, we received a SYN/ACK which did not cover the data
after at least 1 additional SYN was sent (without data). It may be the case
that a middle-box is dropping data-in-SYN packets. Thus, it would be more
efficient to not use TFO on this connection to avoid extra retransmits
during connection establishment.

These new fields do not cover all the cases where TFO may fail, but other
failures, such as SYN/ACK + data being dropped, will result in the
connection not becoming established. And a connection blackhole after
session establishment shows up as a stalled connection.

Signed-off-by: Jason Baron <jbaron@akamai.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Christoph Paasch <cpaasch@apple.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  2 +-
 include/uapi/linux/tcp.h | 10 +++++++++-
 net/ipv4/tcp.c           |  2 ++
 net/ipv4/tcp_fastopen.c  |  5 ++++-
 net/ipv4/tcp_input.c     |  4 ++++
 5 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 668e25a76d69..ca6f01531e64 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -223,7 +223,7 @@ struct tcp_sock {
 		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
 		fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
 		is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
-		unused:2;
+		fastopen_client_fail:2; /* reason why fastopen failed */
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 81e697978e8b..74af1f759cee 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -155,6 +155,14 @@ enum {
 	TCP_QUEUES_NR,
 };
 
+/* why fastopen failed from client perspective */
+enum tcp_fastopen_client_fail {
+	TFO_STATUS_UNSPEC, /* catch-all */
+	TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */
+	TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */
+	TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */
+};
+
 /* for TCP_INFO socket option */
 #define TCPI_OPT_TIMESTAMPS	1
 #define TCPI_OPT_SACK		2
@@ -211,7 +219,7 @@ struct tcp_info {
 	__u8	tcpi_backoff;
 	__u8	tcpi_options;
 	__u8	tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
-	__u8	tcpi_delivery_rate_app_limited:1;
+	__u8	tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2;
 
 	__u32	tcpi_rto;
 	__u32	tcpi_ato;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9d69e1da93f2..8fc1e8b6d408 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2666,6 +2666,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	/* Clean up fastopen related fields */
 	tcp_free_fastopen_req(tp);
 	inet->defer_connect = 0;
+	tp->fastopen_client_fail = 0;
 
 	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
 
@@ -3305,6 +3306,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_reord_seen = tp->reord_seen;
 	info->tcpi_rcv_ooopack = tp->rcv_ooopack;
 	info->tcpi_snd_wnd = tp->snd_wnd;
+	info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index a915ade0c818..19ad9586c720 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -422,7 +422,10 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 		cookie->len = -1;
 		return true;
 	}
-	return cookie->len > 0;
+	if (cookie->len > 0)
+		return true;
+	tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE;
+	return false;
 }
 
 /* This function checks if we want to defer sending SYN until the first
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a2e52ad7cdab..88b987ca9ebb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5814,6 +5814,10 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 	tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
 
 	if (data) { /* Retransmit unacked data in SYN */
+		if (tp->total_retrans)
+			tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
+		else
+			tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
 		skb_rbtree_walk_from(data) {
 			if (__tcp_retransmit_skb(sk, data, 1))
 				break;
-- 
cgit v1.2.3-59-g8ed1b


From ae4a50ee3151d6cb11c56297699ca9025eb18077 Mon Sep 17 00:00:00 2001
From: Chris Packham <chris.packham@alliedtelesis.co.nz>
Date: Fri, 25 Oct 2019 10:36:47 +1300
Subject: mac80211: typo fixes in kerneldoc comments

Correct some trivial typos in kerneldoc comments.

Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Link: https://lore.kernel.org/r/20191024213647.5507-1-chris.packham@alliedtelesis.co.nz
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index d69081c38788..67866fa1328d 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -312,7 +312,7 @@ struct ieee80211_vif_chanctx_switch {
  * @BSS_CHANGED_KEEP_ALIVE: keep alive options (idle period or protected
  *	keep alive) changed.
  * @BSS_CHANGED_MCAST_RATE: Multicast Rate setting changed for this interface
- * @BSS_CHANGED_FTM_RESPONDER: fime timing reasurement request responder
+ * @BSS_CHANGED_FTM_RESPONDER: fine timing measurement request responder
  *	functionality changed for this BSS (AP mode).
  * @BSS_CHANGED_TWT: TWT status changed
  * @BSS_CHANGED_HE_OBSS_PD: OBSS Packet Detection status changed.
@@ -1059,7 +1059,7 @@ struct ieee80211_tx_info {
 };
 
 /**
- * struct ieee80211_tx_status - extended tx staus info for rate control
+ * struct ieee80211_tx_status - extended tx status info for rate control
  *
  * @sta: Station that the packet was transmitted for
  * @info: Basic tx status information
@@ -1702,7 +1702,7 @@ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif);
  *	%IEEE80211_KEY_FLAG_SW_MGMT_TX flag to encrypt such frames in SW.
  * @IEEE80211_KEY_FLAG_GENERATE_IV_MGMT: This flag should be set by the
  *	driver for a CCMP/GCMP key to indicate that is requires IV generation
- *	only for managment frames (MFP).
+ *	only for management frames (MFP).
  * @IEEE80211_KEY_FLAG_RESERVE_TAILROOM: This flag should be set by the
  *	driver for a key to indicate that sufficient tailroom must always
  *	be reserved for ICV or MIC, even when HW encryption is enabled.
@@ -1998,7 +1998,7 @@ struct ieee80211_sta {
 	 *
 	 * * If the skb is transmitted as part of a BA agreement, the
 	 *   A-MSDU maximal size is min(max_amsdu_len, 4065) bytes.
-	 * * If the skb is not part of a BA aggreement, the A-MSDU maximal
+	 * * If the skb is not part of a BA agreement, the A-MSDU maximal
 	 *   size is min(max_amsdu_len, 7935) bytes.
 	 *
 	 * Both additional HT limits must be enforced by the low level
@@ -3187,13 +3187,13 @@ enum ieee80211_rate_control_changed {
  *
  * With the support for multi channel contexts and multi channel operations,
  * remain on channel operations might be limited/deferred/aborted by other
- * flows/operations which have higher priority (and vise versa).
+ * flows/operations which have higher priority (and vice versa).
  * Specifying the ROC type can be used by devices to prioritize the ROC
  * operations compared to other operations/flows.
  *
  * @IEEE80211_ROC_TYPE_NORMAL: There are no special requirements for this ROC.
  * @IEEE80211_ROC_TYPE_MGMT_TX: The remain on channel request is required
- *	for sending managment frames offchannel.
+ *	for sending management frames offchannel.
  */
 enum ieee80211_roc_type {
 	IEEE80211_ROC_TYPE_NORMAL = 0,
@@ -5616,7 +5616,7 @@ void ieee80211_iter_keys_rcu(struct ieee80211_hw *hw,
 
 /**
  * ieee80211_iter_chan_contexts_atomic - iterate channel contexts
- * @hw: pointre obtained from ieee80211_alloc_hw().
+ * @hw: pointer obtained from ieee80211_alloc_hw().
  * @iter: iterator function
  * @iter_data: data passed to iterator function
  *
@@ -6364,7 +6364,7 @@ ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq,
  * again.
  *
  * The API ieee80211_txq_may_transmit() also ensures that TXQ list will be
- * aligned aginst driver's own round-robin scheduler list. i.e it rotates
+ * aligned against driver's own round-robin scheduler list. i.e it rotates
  * the TXQ list till it makes the requested node becomes the first entry
  * in TXQ list. Thus both the TXQ list and driver's list are in sync. If this
  * function returns %true, the driver is expected to schedule packets
-- 
cgit v1.2.3-59-g8ed1b


From 3f2aef10ffad76c31275ae66b1d6e486b22619d6 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 24 Oct 2019 11:32:12 -0700
Subject: mac80211: fix a typo of "function"

Signed-off-by: Joe Perches <joe@perches.com>
Link: https://lore.kernel.org/r/4d53be6c963542878d370ff1a6dc7c3a89b28d23.camel@perches.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 67866fa1328d..f5996960eace 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2626,7 +2626,7 @@ ieee80211_get_alt_retry_rate(const struct ieee80211_hw *hw,
  * @hw: the hardware
  * @skb: the skb
  *
- * Free a transmit skb. Use this funtion when some failure
+ * Free a transmit skb. Use this function when some failure
  * to transmit happened and thus status cannot be reported.
  */
 void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb);
-- 
cgit v1.2.3-59-g8ed1b


From c199ce4f9dd896c716aece33e6750be34aea1151 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 24 Oct 2019 17:22:01 +0200
Subject: net: Fix misspellings of "configure" and "configuration"

Fix various misspellings of "configuration" and "configure".

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Kalle Valo <kvalo@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c | 2 +-
 drivers/net/ethernet/qlogic/qed/qed_int.h                | 4 ++--
 drivers/net/ethernet/qlogic/qed/qed_sriov.h              | 2 +-
 drivers/net/ethernet/qlogic/qede/qede_filter.c           | 2 +-
 drivers/net/wireless/ath/ath9k/ar9003_hw.c               | 2 +-
 drivers/net/wireless/intel/iwlwifi/iwl-fh.h              | 2 +-
 drivers/net/wireless/ti/wlcore/spi.c                     | 2 +-
 include/uapi/linux/dcbnl.h                               | 2 +-
 8 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c
index 633b117eb13e..7b672ada63a3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c
@@ -175,7 +175,7 @@ static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer,
  *	@port_buffer: <output> port receive buffer configuration
  *	@change: <output>
  *
- *	Update buffer configuration based on pfc configuraiton and
+ *	Update buffer configuration based on pfc configuration and
  *	priority to buffer mapping.
  *	Buffer's lossy bit is changed to:
  *		lossless if there is at least one PFC enabled priority
diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.h b/drivers/net/ethernet/qlogic/qed/qed_int.h
index d473b522afc5..9ad568d93ae6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_int.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_int.h
@@ -37,14 +37,14 @@
 #include <linux/slab.h>
 #include "qed.h"
 
-/* Fields of IGU PF CONFIGRATION REGISTER */
+/* Fields of IGU PF CONFIGURATION REGISTER */
 #define IGU_PF_CONF_FUNC_EN       (0x1 << 0)    /* function enable        */
 #define IGU_PF_CONF_MSI_MSIX_EN   (0x1 << 1)    /* MSI/MSIX enable        */
 #define IGU_PF_CONF_INT_LINE_EN   (0x1 << 2)    /* INT enable             */
 #define IGU_PF_CONF_ATTN_BIT_EN   (0x1 << 3)    /* attention enable       */
 #define IGU_PF_CONF_SINGLE_ISR_EN (0x1 << 4)    /* single ISR mode enable */
 #define IGU_PF_CONF_SIMD_MODE     (0x1 << 5)    /* simd all ones mode     */
-/* Fields of IGU VF CONFIGRATION REGISTER */
+/* Fields of IGU VF CONFIGURATION REGISTER */
 #define IGU_VF_CONF_FUNC_EN        (0x1 << 0)	/* function enable        */
 #define IGU_VF_CONF_MSI_MSIX_EN    (0x1 << 1)	/* MSI/MSIX enable        */
 #define IGU_VF_CONF_SINGLE_ISR_EN  (0x1 << 4)	/* single ISR mode enable */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 9a8fd79611f2..368e88565783 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -305,7 +305,7 @@ void qed_iov_bulletin_set_udp_ports(struct qed_hwfn *p_hwfn,
 
 /**
  * @brief Read sriov related information and allocated resources
- *  reads from configuraiton space, shmem, etc.
+ *  reads from configuration space, shmem, etc.
  *
  * @param p_hwfn
  *
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 9a6a9a008714..d6cfe4ffbaf3 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -1298,7 +1298,7 @@ void qede_config_rx_mode(struct net_device *ndev)
 	rx_mode.type = QED_FILTER_TYPE_RX_MODE;
 
 	/* Remove all previous unicast secondary macs and multicast macs
-	 * (configrue / leave the primary mac)
+	 * (configure / leave the primary mac)
 	 */
 	rc = qede_set_ucast_rx_mac(edev, QED_FILTER_XCAST_TYPE_REPLACE,
 				   edev->ndev->dev_addr);
diff --git a/drivers/net/wireless/ath/ath9k/ar9003_hw.c b/drivers/net/wireless/ath/ath9k/ar9003_hw.c
index 2fe12b0de5b4..42f00a2a8c80 100644
--- a/drivers/net/wireless/ath/ath9k/ar9003_hw.c
+++ b/drivers/net/wireless/ath/ath9k/ar9003_hw.c
@@ -1037,7 +1037,7 @@ static void ar9003_hw_configpcipowersave(struct ath_hw *ah,
 	}
 
 	/*
-	 * Configire PCIE after Ini init. SERDES values now come from ini file
+	 * Configure PCIE after Ini init. SERDES values now come from ini file
 	 * This enables PCIe low power mode.
 	 */
 	array = power_off ? &ah->iniPcieSerdes :
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-fh.h b/drivers/net/wireless/intel/iwlwifi/iwl-fh.h
index 0c12df558240..05c1c77c88a0 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-fh.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-fh.h
@@ -148,7 +148,7 @@ static inline unsigned int FH_MEM_CBBC_QUEUE(struct iwl_trans *trans,
  *
  * Bits 3:0:
  * Define the maximum number of pending read requests.
- * Maximum configration value allowed is 0xC
+ * Maximum configuration value allowed is 0xC
  * Bits 9:8:
  * Define the maximum transfer size. (64 / 128 / 256)
  * Bit 10:
diff --git a/drivers/net/wireless/ti/wlcore/spi.c b/drivers/net/wireless/ti/wlcore/spi.c
index d4c09e54fd63..18c4d998ce4b 100644
--- a/drivers/net/wireless/ti/wlcore/spi.c
+++ b/drivers/net/wireless/ti/wlcore/spi.c
@@ -186,7 +186,7 @@ static void wl12xx_spi_init(struct device *child)
 
 	spi_sync(to_spi_device(glue->dev), &m);
 
-	/* Restore chip select configration to normal */
+	/* Restore chip select configuration to normal */
 	spi->mode ^= SPI_CS_HIGH;
 	kfree(cmd);
 }
diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h
index 69df19aa8e72..a791a94013a6 100644
--- a/include/uapi/linux/dcbnl.h
+++ b/include/uapi/linux/dcbnl.h
@@ -286,7 +286,7 @@ struct dcbmsg {
  * @DCB_CMD_GNUMTCS: get the number of traffic classes currently supported
  * @DCB_CMD_SNUMTCS: set the number of traffic classes
  * @DCB_CMD_GBCN: set backward congestion notification configuration
- * @DCB_CMD_SBCN: get backward congestion notification configration.
+ * @DCB_CMD_SBCN: get backward congestion notification configuration.
  * @DCB_CMD_GAPP: get application protocol configuration
  * @DCB_CMD_SAPP: set application protocol configuration
  * @DCB_CMD_IEEE_SET: set IEEE 802.1Qaz configuration
-- 
cgit v1.2.3-59-g8ed1b


From e1b185491f739983b596804953586346e50351c9 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 24 Oct 2019 17:23:23 +0200
Subject: net: Fix various misspellings of "connect"

Fix misspellings of "disconnect", "disconnecting", "connections", and
"disconnected".

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Kalle Valo <kvalo@codeaurora.org>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wimax/i2400m/usb.c                      | 2 +-
 drivers/net/wireless/realtek/rtlwifi/rtl8192se/hw.c | 4 ++--
 include/net/cfg80211.h                              | 2 +-
 net/netfilter/ipvs/ip_vs_ovf.c                      | 2 +-
 net/wireless/reg.h                                  | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wimax/i2400m/usb.c b/drivers/net/wimax/i2400m/usb.c
index 6953f904232f..9659f9e1aaa6 100644
--- a/drivers/net/wimax/i2400m/usb.c
+++ b/drivers/net/wimax/i2400m/usb.c
@@ -511,7 +511,7 @@ error_alloc_netdev:
 
 
 /*
- * Disconect a i2400m from the system.
+ * Disconnect a i2400m from the system.
  *
  * i2400m_stop() has been called before, so al the rx and tx contexts
  * have been taken down already. Make sure the queue is stopped,
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192se/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192se/hw.c
index 6d6e8994460d..81313e0ca834 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192se/hw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192se/hw.c
@@ -1352,9 +1352,9 @@ static void _rtl92s_phy_set_rfhalt(struct ieee80211_hw *hw)
 	/* SW/HW radio off or halt adapter!! For example S3/S4 */
 	} else {
 		/* LED function disable. Power range is about 8mA now. */
-		/* if write 0xF1 disconnet_pci power
+		/* if write 0xF1 disconnect_pci power
 		 *	 ifconfig wlan0 down power are both high 35:70 */
-		/* if write oxF9 disconnet_pci power
+		/* if write oxF9 disconnect_pci power
 		 * ifconfig wlan0 down power are both low  12:45*/
 		rtl_write_byte(rtlpriv, 0x03, 0xF9);
 	}
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 4ab2c49423dc..ab6850bbba99 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6593,7 +6593,7 @@ struct cfg80211_roam_info {
  * time it is accessed in __cfg80211_roamed() due to delay in scheduling
  * rdev->event_work. In case of any failures, the reference is released
  * either in cfg80211_roamed() or in __cfg80211_romed(), Otherwise, it will be
- * released while diconneting from the current bss.
+ * released while disconnecting from the current bss.
  */
 void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info,
 		     gfp_t gfp);
diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c
index 78b074cd5464..c03066fdd5ca 100644
--- a/net/netfilter/ipvs/ip_vs_ovf.c
+++ b/net/netfilter/ipvs/ip_vs_ovf.c
@@ -5,7 +5,7 @@
  * Authors:     Raducu Deaconu <rhadoo_io@yahoo.com>
  *
  * Scheduler implements "overflow" loadbalancing according to number of active
- * connections , will keep all conections to the node with the highest weight
+ * connections , will keep all connections to the node with the highest weight
  * and overflow to the next node if the number of connections exceeds the node's
  * weight.
  * Note that this scheduler might not be suitable for UDP because it only uses
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
index dc8f689bd469..f9e83031a40a 100644
--- a/net/wireless/reg.h
+++ b/net/wireless/reg.h
@@ -114,7 +114,7 @@ void regulatory_hint_country_ie(struct wiphy *wiphy,
 			 u8 country_ie_len);
 
 /**
- * regulatory_hint_disconnect - informs all devices have been disconneted
+ * regulatory_hint_disconnect - informs all devices have been disconnected
  *
  * Regulotory rules can be enhanced further upon scanning and upon
  * connection to an AP. These rules become stale if we disconnect
-- 
cgit v1.2.3-59-g8ed1b


From 6b297524234ccf3954b54609ab6bc2e8c4d3f677 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Fri, 25 Oct 2019 01:03:51 +0200
Subject: net: dsa: Add support for devlink device parameters

Add plumbing to allow DSA drivers to register parameters with devlink.

To keep with the abstraction, the DSA drivers pass the ds structure to
these helpers, and the DSA core then translates that to the devlink
structure associated to the device.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 23 +++++++++++++++++++++++
 net/dsa/dsa.c     | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/dsa/dsa2.c    |  7 ++++++-
 3 files changed, 77 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index e3c14dc3bab9..d5f6e5ccca38 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -550,6 +550,29 @@ struct dsa_switch_ops {
 	 */
 	netdev_tx_t (*port_deferred_xmit)(struct dsa_switch *ds, int port,
 					  struct sk_buff *skb);
+	/* Devlink parameters */
+	int	(*devlink_param_get)(struct dsa_switch *ds, u32 id,
+				     struct devlink_param_gset_ctx *ctx);
+	int	(*devlink_param_set)(struct dsa_switch *ds, u32 id,
+				     struct devlink_param_gset_ctx *ctx);
+};
+
+#define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)		\
+	DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes,		\
+			     dsa_devlink_param_get, dsa_devlink_param_set, NULL)
+
+int dsa_devlink_param_get(struct devlink *dl, u32 id,
+			  struct devlink_param_gset_ctx *ctx);
+int dsa_devlink_param_set(struct devlink *dl, u32 id,
+			  struct devlink_param_gset_ctx *ctx);
+int dsa_devlink_params_register(struct dsa_switch *ds,
+				const struct devlink_param *params,
+				size_t params_count);
+void dsa_devlink_params_unregister(struct dsa_switch *ds,
+				   const struct devlink_param *params,
+				   size_t params_count);
+struct dsa_devlink_priv {
+	struct dsa_switch *ds;
 };
 
 struct dsa_switch_driver {
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index a5545762f5e7..db1c1c7e40e9 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -331,6 +331,54 @@ int call_dsa_notifiers(unsigned long val, struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(call_dsa_notifiers);
 
+int dsa_devlink_param_get(struct devlink *dl, u32 id,
+			  struct devlink_param_gset_ctx *ctx)
+{
+	struct dsa_devlink_priv *dl_priv;
+	struct dsa_switch *ds;
+
+	dl_priv = devlink_priv(dl);
+	ds = dl_priv->ds;
+
+	if (!ds->ops->devlink_param_get)
+		return -EOPNOTSUPP;
+
+	return ds->ops->devlink_param_get(ds, id, ctx);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_param_get);
+
+int dsa_devlink_param_set(struct devlink *dl, u32 id,
+			  struct devlink_param_gset_ctx *ctx)
+{
+	struct dsa_devlink_priv *dl_priv;
+	struct dsa_switch *ds;
+
+	dl_priv = devlink_priv(dl);
+	ds = dl_priv->ds;
+
+	if (!ds->ops->devlink_param_set)
+		return -EOPNOTSUPP;
+
+	return ds->ops->devlink_param_set(ds, id, ctx);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_param_set);
+
+int dsa_devlink_params_register(struct dsa_switch *ds,
+				const struct devlink_param *params,
+				size_t params_count)
+{
+	return devlink_params_register(ds->devlink, params, params_count);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_params_register);
+
+void dsa_devlink_params_unregister(struct dsa_switch *ds,
+				   const struct devlink_param *params,
+				   size_t params_count)
+{
+	devlink_params_unregister(ds->devlink, params, params_count);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_params_unregister);
+
 static int __init dsa_init_module(void)
 {
 	int rc;
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 214dd703b0cc..e7aae96b54bb 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -349,6 +349,7 @@ static void dsa_port_teardown(struct dsa_port *dp)
 
 static int dsa_switch_setup(struct dsa_switch *ds)
 {
+	struct dsa_devlink_priv *dl_priv;
 	int err;
 
 	if (ds->setup)
@@ -364,9 +365,11 @@ static int dsa_switch_setup(struct dsa_switch *ds)
 	/* Add the switch to devlink before calling setup, so that setup can
 	 * add dpipe tables
 	 */
-	ds->devlink = devlink_alloc(&dsa_devlink_ops, 0);
+	ds->devlink = devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv));
 	if (!ds->devlink)
 		return -ENOMEM;
+	dl_priv = devlink_priv(ds->devlink);
+	dl_priv->ds = ds;
 
 	err = devlink_register(ds->devlink, ds->dev);
 	if (err)
@@ -380,6 +383,8 @@ static int dsa_switch_setup(struct dsa_switch *ds)
 	if (err < 0)
 		goto unregister_notifier;
 
+	devlink_params_publish(ds->devlink);
+
 	if (!ds->slave_mii_bus && ds->ops->phy_read) {
 		ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev);
 		if (!ds->slave_mii_bus) {
-- 
cgit v1.2.3-59-g8ed1b


From d607525bd912860aad137326a1076d1e9880ddf0 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Fri, 25 Oct 2019 14:48:53 -0400
Subject: net: dsa: return directly from dsa_to_port

Return directly from within the loop as soon as the port is found,
otherwise we won't return NULL if the end of the list is reached.

Fixes: b96ddf254b09 ("net: dsa: use ports list in dsa_to_port")
Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index d5f6e5ccca38..9aba326abb64 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -283,13 +283,13 @@ struct dsa_switch {
 static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p)
 {
 	struct dsa_switch_tree *dst = ds->dst;
-	struct dsa_port *dp = NULL;
+	struct dsa_port *dp;
 
 	list_for_each_entry(dp, &dst->ports, list)
 		if (dp->ds == ds && dp->index == p)
-			break;
+			return dp;
 
-	return dp;
+	return NULL;
 }
 
 static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p)
-- 
cgit v1.2.3-59-g8ed1b


From 8466a57dfbb0c9bf6db4685ed9c4144b8deec688 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Tue, 29 Oct 2019 12:43:46 +0100
Subject: net/smc: remove unneeded include for smc.h

The only smc-related reference in net/sock.h is struct smc_hashinfo.
But just its address is refered to. Thus there is no need for the
include of net/smc.h. Remove it.

Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 380312cc67a9..09c26a5ecbff 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -66,7 +66,6 @@
 #include <net/checksum.h>
 #include <net/tcp_states.h>
 #include <linux/net_tstamp.h>
-#include <net/smc.h>
 #include <net/l3mdev.h>
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From c0bceb97db9efc72629dd00cd0d9812f24d4ba2d Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Wed, 30 Oct 2019 14:00:41 +0100
Subject: tipc: add smart nagle feature

We introduce a feature that works like a combination of TCP_NAGLE and
TCP_CORK, but without some of the weaknesses of those. In particular,
we will not observe long delivery delays because of delayed acks, since
the algorithm itself decides if and when acks are to be sent from the
receiving peer.

- The nagle property as such is determined by manipulating a new
  'maxnagle' field in struct tipc_sock. If certain conditions are met,
  'maxnagle' will define max size of the messages which can be bundled.
  If it is set to zero no messages are ever bundled, implying that the
  nagle property is disabled.
- A socket with the nagle property enabled enters nagle mode when more
  than 4 messages have been sent out without receiving any data message
  from the peer.
- A socket leaves nagle mode whenever it receives a data message from
  the peer.

In nagle mode, messages smaller than 'maxnagle' are accumulated in the
socket write queue. The last buffer in the queue is marked with a new
'ack_required' bit, which forces the receiving peer to send a CONN_ACK
message back to the sender upon reception.

The accumulated contents of the write queue is transmitted when one of
the following events or conditions occur.

- A CONN_ACK message is received from the peer.
- A data message is received from the peer.
- A SOCK_WAKEUP pseudo message is received from the link level.
- The write queue contains more than 64 1k blocks of data.
- The connection is being shut down.
- There is no CONN_ACK message to expect. I.e., there is currently
  no outstanding message where the 'ack_required' bit was set. As a
  consequence, the first message added after we enter nagle mode
  is always sent directly with this bit set.

This new feature gives a 50-100% improvement of throughput for small
(i.e., less than MTU size) messages, while it might add up to one RTT
to latency time when the socket is in nagle mode.

Acked-by: Ying Xue <ying.xue@windreiver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc.h |   1 +
 net/tipc/msg.c            |  53 +++++++++++++++++++++
 net/tipc/msg.h            |  12 +++++
 net/tipc/node.h           |   7 ++-
 net/tipc/socket.c         | 117 +++++++++++++++++++++++++++++++++++++++-------
 5 files changed, 170 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index 7df026ea6aff..76421b878767 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -191,6 +191,7 @@ struct sockaddr_tipc {
 #define TIPC_GROUP_JOIN         135     /* Takes struct tipc_group_req* */
 #define TIPC_GROUP_LEAVE        136     /* No argument */
 #define TIPC_SOCK_RECVQ_USED    137     /* Default: none (read only) */
+#define TIPC_NODELAY            138     /* Default: false */
 
 /*
  * Flag values
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 922d262e153f..973795a1a968 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -190,6 +190,59 @@ err:
 	return 0;
 }
 
+/**
+ * tipc_msg_append(): Append data to tail of an existing buffer queue
+ * @hdr: header to be used
+ * @m: the data to be appended
+ * @mss: max allowable size of buffer
+ * @dlen: size of data to be appended
+ * @txq: queue to appand to
+ * Returns the number og 1k blocks appended or errno value
+ */
+int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen,
+		    int mss, struct sk_buff_head *txq)
+{
+	struct sk_buff *skb, *prev;
+	int accounted, total, curr;
+	int mlen, cpy, rem = dlen;
+	struct tipc_msg *hdr;
+
+	skb = skb_peek_tail(txq);
+	accounted = skb ? msg_blocks(buf_msg(skb)) : 0;
+	total = accounted;
+
+	while (rem) {
+		if (!skb || skb->len >= mss) {
+			prev = skb;
+			skb = tipc_buf_acquire(mss, GFP_KERNEL);
+			if (unlikely(!skb))
+				return -ENOMEM;
+			skb_orphan(skb);
+			skb_trim(skb, MIN_H_SIZE);
+			hdr = buf_msg(skb);
+			skb_copy_to_linear_data(skb, _hdr, MIN_H_SIZE);
+			msg_set_hdr_sz(hdr, MIN_H_SIZE);
+			msg_set_size(hdr, MIN_H_SIZE);
+			__skb_queue_tail(txq, skb);
+			total += 1;
+			if (prev)
+				msg_set_ack_required(buf_msg(prev), 0);
+			msg_set_ack_required(hdr, 1);
+		}
+		hdr = buf_msg(skb);
+		curr = msg_blocks(hdr);
+		mlen = msg_size(hdr);
+		cpy = min_t(int, rem, mss - mlen);
+		if (cpy != copy_from_iter(skb->data + mlen, cpy, &m->msg_iter))
+			return -EFAULT;
+		msg_set_size(hdr, mlen + cpy);
+		skb_put(skb, cpy);
+		rem -= cpy;
+		total += msg_blocks(hdr) - curr;
+	}
+	return total - accounted;
+}
+
 /* tipc_msg_validate - validate basic format of received message
  *
  * This routine ensures a TIPC message has an acceptable header, and at least
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 2d7cb66a6912..0435dda4b90c 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -290,6 +290,16 @@ static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d)
 	msg_set_bits(m, 0, 18, 1, d);
 }
 
+static inline int msg_ack_required(struct tipc_msg *m)
+{
+	return msg_bits(m, 0, 18, 1);
+}
+
+static inline void msg_set_ack_required(struct tipc_msg *m, u32 d)
+{
+	msg_set_bits(m, 0, 18, 1, d);
+}
+
 static inline bool msg_is_rcast(struct tipc_msg *m)
 {
 	return msg_bits(m, 0, 18, 0x1);
@@ -1079,6 +1089,8 @@ int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr,
 		      int pktmax, struct sk_buff_head *frags);
 int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m,
 		   int offset, int dsz, int mtu, struct sk_buff_head *list);
+int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen,
+		    int mss, struct sk_buff_head *txq);
 bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err);
 bool tipc_msg_assemble(struct sk_buff_head *list);
 bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq);
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 30563c4f35d5..c39cd861c07d 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -54,7 +54,8 @@ enum {
 	TIPC_LINK_PROTO_SEQNO = (1 << 6),
 	TIPC_MCAST_RBCTL      = (1 << 7),
 	TIPC_GAP_ACK_BLOCK    = (1 << 8),
-	TIPC_TUNNEL_ENHANCED  = (1 << 9)
+	TIPC_TUNNEL_ENHANCED  = (1 << 9),
+	TIPC_NAGLE            = (1 << 10)
 };
 
 #define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT           |  \
@@ -66,7 +67,9 @@ enum {
 				TIPC_LINK_PROTO_SEQNO  |   \
 				TIPC_MCAST_RBCTL       |   \
 				TIPC_GAP_ACK_BLOCK     |   \
-				TIPC_TUNNEL_ENHANCED)
+				TIPC_TUNNEL_ENHANCED   |   \
+				TIPC_NAGLE)
+
 #define INVALID_BEARER_ID -1
 
 void tipc_node_stop(struct net *net);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 2bcacd6022d5..3e99a122e321 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -75,6 +75,7 @@ struct sockaddr_pair {
  * @conn_instance: TIPC instance used when connection was established
  * @published: non-zero if port has one or more associated names
  * @max_pkt: maximum packet size "hint" used when building messages sent by port
+ * @maxnagle: maximum size of msg which can be subject to nagle
  * @portid: unique port identity in TIPC socket hash table
  * @phdr: preformatted message header used when sending messages
  * #cong_links: list of congested links
@@ -97,6 +98,7 @@ struct tipc_sock {
 	u32 conn_instance;
 	int published;
 	u32 max_pkt;
+	u32 maxnagle;
 	u32 portid;
 	struct tipc_msg phdr;
 	struct list_head cong_links;
@@ -116,6 +118,10 @@ struct tipc_sock {
 	struct tipc_mc_method mc_method;
 	struct rcu_head rcu;
 	struct tipc_group *group;
+	u32 oneway;
+	u16 snd_backlog;
+	bool expect_ack;
+	bool nodelay;
 	bool group_is_open;
 };
 
@@ -137,6 +143,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk);
 static void tipc_sk_remove(struct tipc_sock *tsk);
 static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz);
 static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz);
+static void tipc_sk_push_backlog(struct tipc_sock *tsk);
 
 static const struct proto_ops packet_ops;
 static const struct proto_ops stream_ops;
@@ -227,6 +234,26 @@ static u16 tsk_inc(struct tipc_sock *tsk, int msglen)
 	return 1;
 }
 
+/* tsk_set_nagle - enable/disable nagle property by manipulating maxnagle
+ */
+static void tsk_set_nagle(struct tipc_sock *tsk)
+{
+	struct sock *sk = &tsk->sk;
+
+	tsk->maxnagle = 0;
+	if (sk->sk_type != SOCK_STREAM)
+		return;
+	if (tsk->nodelay)
+		return;
+	if (!(tsk->peer_caps & TIPC_NAGLE))
+		return;
+	/* Limit node local buffer size to avoid receive queue overflow */
+	if (tsk->max_pkt == MAX_MSG_SIZE)
+		tsk->maxnagle = 1500;
+	else
+		tsk->maxnagle = tsk->max_pkt;
+}
+
 /**
  * tsk_advance_rx_queue - discard first buffer in socket receive queue
  *
@@ -446,6 +473,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
 
 	tsk = tipc_sk(sk);
 	tsk->max_pkt = MAX_PKT_DEFAULT;
+	tsk->maxnagle = 0;
 	INIT_LIST_HEAD(&tsk->publications);
 	INIT_LIST_HEAD(&tsk->cong_links);
 	msg = &tsk->phdr;
@@ -512,8 +540,12 @@ static void __tipc_shutdown(struct socket *sock, int error)
 	tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt &&
 					    !tsk_conn_cong(tsk)));
 
-	/* Remove any pending SYN message */
-	__skb_queue_purge(&sk->sk_write_queue);
+	/* Push out unsent messages or remove if pending SYN */
+	skb = skb_peek(&sk->sk_write_queue);
+	if (skb && !msg_is_syn(buf_msg(skb)))
+		tipc_sk_push_backlog(tsk);
+	else
+		__skb_queue_purge(&sk->sk_write_queue);
 
 	/* Reject all unreceived messages, except on an active connection
 	 * (which disconnects locally & sends a 'FIN+' to peer).
@@ -1208,6 +1240,27 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
 	tipc_sk_rcv(net, inputq);
 }
 
+/* tipc_sk_push_backlog(): send accumulated buffers in socket write queue
+ *                         when socket is in Nagle mode
+ */
+static void tipc_sk_push_backlog(struct tipc_sock *tsk)
+{
+	struct sk_buff_head *txq = &tsk->sk.sk_write_queue;
+	struct net *net = sock_net(&tsk->sk);
+	u32 dnode = tsk_peer_node(tsk);
+	int rc;
+
+	if (skb_queue_empty(txq) || tsk->cong_link_cnt)
+		return;
+
+	tsk->snt_unacked += tsk->snd_backlog;
+	tsk->snd_backlog = 0;
+	tsk->expect_ack = true;
+	rc = tipc_node_xmit(net, txq, dnode, tsk->portid);
+	if (rc == -ELINKCONG)
+		tsk->cong_link_cnt = 1;
+}
+
 /**
  * tipc_sk_conn_proto_rcv - receive a connection mng protocol message
  * @tsk: receiving socket
@@ -1221,7 +1274,7 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
 	u32 onode = tsk_own_node(tsk);
 	struct sock *sk = &tsk->sk;
 	int mtyp = msg_type(hdr);
-	bool conn_cong;
+	bool was_cong;
 
 	/* Ignore if connection cannot be validated: */
 	if (!tsk_peer_msg(tsk, hdr)) {
@@ -1254,11 +1307,13 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb,
 			__skb_queue_tail(xmitq, skb);
 		return;
 	} else if (mtyp == CONN_ACK) {
-		conn_cong = tsk_conn_cong(tsk);
+		was_cong = tsk_conn_cong(tsk);
+		tsk->expect_ack = false;
+		tipc_sk_push_backlog(tsk);
 		tsk->snt_unacked -= msg_conn_ack(hdr);
 		if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL)
 			tsk->snd_win = msg_adv_win(hdr);
-		if (conn_cong)
+		if (was_cong && !tsk_conn_cong(tsk))
 			sk->sk_write_space(sk);
 	} else if (mtyp != CONN_PROBE_REPLY) {
 		pr_warn("Received unknown CONN_PROTO msg\n");
@@ -1437,15 +1492,15 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
 	struct sock *sk = sock->sk;
 	DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
 	long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
+	struct sk_buff_head *txq = &sk->sk_write_queue;
 	struct tipc_sock *tsk = tipc_sk(sk);
 	struct tipc_msg *hdr = &tsk->phdr;
 	struct net *net = sock_net(sk);
-	struct sk_buff_head pkts;
 	u32 dnode = tsk_peer_node(tsk);
+	int maxnagle = tsk->maxnagle;
+	int maxpkt = tsk->max_pkt;
 	int send, sent = 0;
-	int rc = 0;
-
-	__skb_queue_head_init(&pkts);
+	int blocks, rc = 0;
 
 	if (unlikely(dlen > INT_MAX))
 		return -EMSGSIZE;
@@ -1467,21 +1522,35 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen)
 					 tipc_sk_connected(sk)));
 		if (unlikely(rc))
 			break;
-
 		send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE);
-		rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts);
-		if (unlikely(rc != send))
-			break;
-
-		trace_tipc_sk_sendstream(sk, skb_peek(&pkts),
+		blocks = tsk->snd_backlog;
+		if (tsk->oneway++ >= 4 && send <= maxnagle) {
+			rc = tipc_msg_append(hdr, m, send, maxnagle, txq);
+			if (unlikely(rc < 0))
+				break;
+			blocks += rc;
+			if (blocks <= 64 && tsk->expect_ack) {
+				tsk->snd_backlog = blocks;
+				sent += send;
+				break;
+			}
+			tsk->expect_ack = true;
+		} else {
+			rc = tipc_msg_build(hdr, m, sent, send, maxpkt, txq);
+			if (unlikely(rc != send))
+				break;
+			blocks += tsk_inc(tsk, send + MIN_H_SIZE);
+		}
+		trace_tipc_sk_sendstream(sk, skb_peek(txq),
 					 TIPC_DUMP_SK_SNDQ, " ");
-		rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
+		rc = tipc_node_xmit(net, txq, dnode, tsk->portid);
 		if (unlikely(rc == -ELINKCONG)) {
 			tsk->cong_link_cnt = 1;
 			rc = 0;
 		}
 		if (likely(!rc)) {
-			tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE);
+			tsk->snt_unacked += blocks;
+			tsk->snd_backlog = 0;
 			sent += send;
 		}
 	} while (sent < dlen && !rc);
@@ -1528,6 +1597,7 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port,
 	tipc_node_add_conn(net, peer_node, tsk->portid, peer_port);
 	tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid, true);
 	tsk->peer_caps = tipc_node_get_capabilities(net, peer_node);
+	tsk_set_nagle(tsk);
 	__skb_queue_purge(&sk->sk_write_queue);
 	if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL)
 		return;
@@ -1848,6 +1918,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m,
 	bool peek = flags & MSG_PEEK;
 	int offset, required, copy, copied = 0;
 	int hlen, dlen, err, rc;
+	bool ack = false;
 	long timeout;
 
 	/* Catch invalid receive attempts */
@@ -1892,6 +1963,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m,
 
 		/* Copy data if msg ok, otherwise return error/partial data */
 		if (likely(!err)) {
+			ack = msg_ack_required(hdr);
 			offset = skb_cb->bytes_read;
 			copy = min_t(int, dlen - offset, buflen - copied);
 			rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy);
@@ -1919,7 +1991,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m,
 
 		/* Send connection flow control advertisement when applicable */
 		tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen);
-		if (unlikely(tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE))
+		if (ack || tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE)
 			tipc_sk_send_ack(tsk);
 
 		/* Exit if all requested data or FIN/error received */
@@ -1990,6 +2062,7 @@ static void tipc_sk_proto_rcv(struct sock *sk,
 		smp_wmb();
 		tsk->cong_link_cnt--;
 		wakeup = true;
+		tipc_sk_push_backlog(tsk);
 		break;
 	case GROUP_PROTOCOL:
 		tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq);
@@ -2029,6 +2102,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
 
 	if (unlikely(msg_mcast(hdr)))
 		return false;
+	tsk->oneway = 0;
 
 	switch (sk->sk_state) {
 	case TIPC_CONNECTING:
@@ -2074,6 +2148,8 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb)
 			return true;
 		return false;
 	case TIPC_ESTABLISHED:
+		if (!skb_queue_empty(&sk->sk_write_queue))
+			tipc_sk_push_backlog(tsk);
 		/* Accept only connection-based messages sent by peer */
 		if (likely(con_msg && !err && pport == oport && pnode == onode))
 			return true;
@@ -2959,6 +3035,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
 	case TIPC_SRC_DROPPABLE:
 	case TIPC_DEST_DROPPABLE:
 	case TIPC_CONN_TIMEOUT:
+	case TIPC_NODELAY:
 		if (ol < sizeof(value))
 			return -EINVAL;
 		if (get_user(value, (u32 __user *)ov))
@@ -3007,6 +3084,10 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt,
 	case TIPC_GROUP_LEAVE:
 		res = tipc_sk_leave(tsk);
 		break;
+	case TIPC_NODELAY:
+		tsk->nodelay = !!value;
+		tsk_set_nagle(tsk);
+		break;
 	default:
 		res = -EINVAL;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 98298e6ca6d5908f96e529e70a254a4d5bf754e7 Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@redhat.com>
Date: Tue, 29 Oct 2019 14:50:50 +0100
Subject: flow_dissector: add meaningful comments

Documents two piece of code which can't be understood at a glance.

Signed-off-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 1 +
 net/core/flow_dissector.c    | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 90bd210be060..7747af3cc500 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -282,6 +282,7 @@ struct flow_keys {
 	struct flow_dissector_key_vlan cvlan;
 	struct flow_dissector_key_keyid keyid;
 	struct flow_dissector_key_ports ports;
+	/* 'addrs' must be the last member */
 	struct flow_dissector_key_addrs addrs;
 };
 
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index dbf502c18656..bc22b384ac6c 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1408,6 +1408,9 @@ static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
 {
 	size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
 	BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
+	/* flow.addrs MUST be the last member in struct flow_keys because
+	 * different L3 protocols have different address length
+	 */
 	BUILD_BUG_ON(offsetof(typeof(*flow), addrs) !=
 		     sizeof(*flow) - sizeof(flow->addrs));
 
@@ -1455,6 +1458,9 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow)
 }
 EXPORT_SYMBOL(flow_get_u32_dst);
 
+/* Sort the source and destination IP (and the ports if the IP are the same),
+ * to have consistent hash within the two directions
+ */
 static inline void __flow_hash_consistentify(struct flow_keys *keys)
 {
 	int addr_diff, i;
-- 
cgit v1.2.3-59-g8ed1b


From 5dec597e5cd0f4c3000d120508efa64157d5bd7a Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@redhat.com>
Date: Tue, 29 Oct 2019 14:50:52 +0100
Subject: flow_dissector: extract more ICMP information

The ICMP flow dissector currently parses only the Type and Code fields.
Some ICMP packets (echo, timestamp) have a 16 bit Identifier field which
is used to correlate packets.
Add such field in flow_dissector_key_icmp and replace skb_flow_get_be16()
with a more complex function which populate this field.

Signed-off-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 19 +++++++-----
 net/core/flow_dissector.c    | 74 ++++++++++++++++++++++++++++++--------------
 2 files changed, 61 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 7747af3cc500..f8541d018848 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -6,6 +6,8 @@
 #include <linux/in6.h>
 #include <uapi/linux/if_ether.h>
 
+struct sk_buff;
+
 /**
  * struct flow_dissector_key_control:
  * @thoff: Transport header offset
@@ -156,19 +158,16 @@ struct flow_dissector_key_ports {
 
 /**
  * flow_dissector_key_icmp:
- *	@ports: type and code of ICMP header
- *		icmp: ICMP type (high) and code (low)
  *		type: ICMP type
  *		code: ICMP code
+ *		id:   session identifier
  */
 struct flow_dissector_key_icmp {
-	union {
-		__be16 icmp;
-		struct {
-			u8 type;
-			u8 code;
-		};
+	struct {
+		u8 type;
+		u8 code;
 	};
+	u16 id;
 };
 
 /**
@@ -282,6 +281,7 @@ struct flow_keys {
 	struct flow_dissector_key_vlan cvlan;
 	struct flow_dissector_key_keyid keyid;
 	struct flow_dissector_key_ports ports;
+	struct flow_dissector_key_icmp icmp;
 	/* 'addrs' must be the last member */
 	struct flow_dissector_key_addrs addrs;
 };
@@ -316,6 +316,9 @@ static inline bool flow_keys_have_l4(const struct flow_keys *keys)
 }
 
 u32 flow_hash_from_keys(struct flow_keys *keys);
+void skb_flow_get_icmp_tci(const struct sk_buff *skb,
+			   struct flow_dissector_key_icmp *key_icmp,
+			   void *data, int thoff, int hlen);
 
 static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector,
 				      enum flow_dissector_key_id key_id)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 0fb721976f3d..0807df0bde02 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -178,27 +178,6 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
 	mutex_unlock(&flow_dissector_mutex);
 	return 0;
 }
-/**
- * skb_flow_get_be16 - extract be16 entity
- * @skb: sk_buff to extract from
- * @poff: offset to extract at
- * @data: raw buffer pointer to the packet
- * @hlen: packet header length
- *
- * The function will try to retrieve a be32 entity at
- * offset poff
- */
-static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff,
-				void *data, int hlen)
-{
-	__be16 *u, _u;
-
-	u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u);
-	if (u)
-		return *u;
-
-	return 0;
-}
 
 /**
  * __skb_flow_get_ports - extract the upper layer ports and return them
@@ -234,8 +213,54 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
 }
 EXPORT_SYMBOL(__skb_flow_get_ports);
 
-/* If FLOW_DISSECTOR_KEY_ICMP is set, get the Type and Code from an ICMP packet
- * using skb_flow_get_be16().
+static bool icmp_has_id(u8 type)
+{
+	switch (type) {
+	case ICMP_ECHO:
+	case ICMP_ECHOREPLY:
+	case ICMP_TIMESTAMP:
+	case ICMP_TIMESTAMPREPLY:
+	case ICMPV6_ECHO_REQUEST:
+	case ICMPV6_ECHO_REPLY:
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields
+ * @skb: sk_buff to extract from
+ * @key_icmp: struct flow_dissector_key_icmp to fill
+ * @data: raw buffer pointer to the packet
+ * @toff: offset to extract at
+ * @hlen: packet header length
+ */
+void skb_flow_get_icmp_tci(const struct sk_buff *skb,
+			   struct flow_dissector_key_icmp *key_icmp,
+			   void *data, int thoff, int hlen)
+{
+	struct icmphdr *ih, _ih;
+
+	ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih);
+	if (!ih)
+		return;
+
+	key_icmp->type = ih->type;
+	key_icmp->code = ih->code;
+
+	/* As we use 0 to signal that the Id field is not present,
+	 * avoid confusion with packets without such field
+	 */
+	if (icmp_has_id(ih->type))
+		key_icmp->id = ih->un.echo.id ? : 1;
+	else
+		key_icmp->id = 0;
+}
+EXPORT_SYMBOL(skb_flow_get_icmp_tci);
+
+/* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet
+ * using skb_flow_get_icmp_tci().
  */
 static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
 				    struct flow_dissector *flow_dissector,
@@ -250,7 +275,8 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
 	key_icmp = skb_flow_dissector_target(flow_dissector,
 					     FLOW_DISSECTOR_KEY_ICMP,
 					     target_container);
-	key_icmp->icmp = skb_flow_get_be16(skb, thoff, data, hlen);
+
+	skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen);
 }
 
 void skb_flow_dissect_meta(const struct sk_buff *skb,
-- 
cgit v1.2.3-59-g8ed1b


From c8ecebd04cbb6badb46d42fe54282e7883ed63cc Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Wed, 30 Oct 2019 16:09:00 +0200
Subject: net: sched: extract common action counters update code into function

Currently, all implementations of tc_action_ops->stats_update() callback
have almost exactly the same implementation of counters update
code (besides gact which also updates drop counter). In order to simplify
support for using both percpu-allocated and regular action counters
depending on run-time flag in following patches, extract action counters
update code into standalone function in act API.

This commit doesn't change functionality.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h  |  2 ++
 net/sched/act_api.c    | 14 ++++++++++++++
 net/sched/act_ct.c     |  6 +-----
 net/sched/act_gact.c   | 10 +---------
 net/sched/act_mirred.c |  5 +----
 net/sched/act_police.c |  5 +----
 net/sched/act_vlan.c   |  5 +----
 7 files changed, 21 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index b18c699681ca..f6f66c692385 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -186,6 +186,8 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind,
 		    int ref);
 int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
+void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets,
+			     bool drop, bool hw);
 int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
 
 int tcf_action_check_ctrlact(int action, struct tcf_proto *tp,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 69d4676a402f..0638afa2fc3f 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -989,6 +989,20 @@ err:
 	return err;
 }
 
+void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets,
+			     bool drop, bool hw)
+{
+	_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+
+	if (drop)
+		this_cpu_ptr(a->cpu_qstats)->drops += packets;
+
+	if (hw)
+		_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+				   bytes, packets);
+}
+EXPORT_SYMBOL(tcf_action_update_stats);
+
 int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
 			  int compat_mode)
 {
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index fcc46025e790..ba76857754e5 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -905,11 +905,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
 {
 	struct tcf_ct *c = to_ct(a);
 
-	_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
-
-	if (hw)
-		_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
-				   bytes, packets);
+	tcf_action_update_stats(a, bytes, packets, false, hw);
 	c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
 }
 
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 324f1d1f6d47..569cec63d4c3 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -177,15 +177,7 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
 	int action = READ_ONCE(gact->tcf_action);
 	struct tcf_t *tm = &gact->tcf_tm;
 
-	_bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes,
-			   packets);
-	if (action == TC_ACT_SHOT)
-		this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
-
-	if (hw)
-		_bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw),
-				   bytes, packets);
-
+	tcf_action_update_stats(a, bytes, packets, action == TC_ACT_SHOT, hw);
 	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 08923b21e566..621686a6b5be 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -318,10 +318,7 @@ static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
 	struct tcf_mirred *m = to_mirred(a);
 	struct tcf_t *tm = &m->tcf_tm;
 
-	_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
-	if (hw)
-		_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
-				   bytes, packets);
+	tcf_action_update_stats(a, bytes, packets, false, hw);
 	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 981a9eca0c52..51d34b1a61d5 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -294,10 +294,7 @@ static void tcf_police_stats_update(struct tc_action *a,
 	struct tcf_police *police = to_police(a);
 	struct tcf_t *tm = &police->tcf_tm;
 
-	_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
-	if (hw)
-		_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
-				   bytes, packets);
+	tcf_action_update_stats(a, bytes, packets, false, hw);
 	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 08aaf719a70f..9e68edb22e53 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -307,10 +307,7 @@ static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u32 packets,
 	struct tcf_vlan *v = to_vlan(a);
 	struct tcf_t *tm = &v->tcf_tm;
 
-	_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
-	if (hw)
-		_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
-				   bytes, packets);
+	tcf_action_update_stats(a, bytes, packets, false, hw);
 	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 5e1ad95b630e652d3467d1fd1f0b5e5ea2c441e2 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Wed, 30 Oct 2019 16:09:01 +0200
Subject: net: sched: extract bstats update code into function

Extract common code that increments cpu_bstats counter into standalone act
API function. Change hardware offloaded actions that use percpu counter
allocation to use the new function instead of incrementing cpu_bstats
directly.

This commit doesn't change functionality.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h      | 7 +++++++
 net/sched/act_csum.c       | 2 +-
 net/sched/act_ct.c         | 2 +-
 net/sched/act_gact.c       | 2 +-
 net/sched/act_mirred.c     | 2 +-
 net/sched/act_tunnel_key.c | 2 +-
 net/sched/act_vlan.c       | 2 +-
 7 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index f6f66c692385..9a32853f77f9 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -186,6 +186,13 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind,
 		    int ref);
 int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
+
+static inline void tcf_action_update_bstats(struct tc_action *a,
+					    struct sk_buff *skb)
+{
+	bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb);
+}
+
 void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets,
 			     bool drop, bool hw);
 int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index d3cfad88dc3a..69747b1860aa 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -580,7 +580,7 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a,
 	params = rcu_dereference_bh(p->params);
 
 	tcf_lastuse_update(&p->tcf_tm);
-	bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb);
+	tcf_action_update_bstats(&p->common, skb);
 
 	action = READ_ONCE(p->tcf_action);
 	if (unlikely(action == TC_ACT_SHOT))
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index ba76857754e5..f9779907dcf7 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -465,7 +465,7 @@ out_push:
 	skb_push_rcsum(skb, nh_ofs);
 
 out:
-	bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb);
+	tcf_action_update_bstats(&c->common, skb);
 	return retval;
 
 drop:
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 569cec63d4c3..a7e3d5621608 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -161,7 +161,7 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a,
 		action = gact_rand[ptype](gact);
 	}
 #endif
-	bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb);
+	tcf_action_update_bstats(&gact->common, skb);
 	if (action == TC_ACT_SHOT)
 		qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats));
 
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 621686a6b5be..e5216f80883b 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -231,7 +231,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 	}
 
 	tcf_lastuse_update(&m->tcf_tm);
-	bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
+	tcf_action_update_bstats(&m->common, skb);
 
 	m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
 	m_eaction = READ_ONCE(m->tcfm_eaction);
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 2f83a79f76aa..9ab2d3b4a9fc 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -31,7 +31,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
 	params = rcu_dereference_bh(t->params);
 
 	tcf_lastuse_update(&t->tcf_tm);
-	bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
+	tcf_action_update_bstats(&t->common, skb);
 	action = READ_ONCE(t->tcf_action);
 
 	switch (params->tcft_action) {
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 9e68edb22e53..f6dccaa29239 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -29,7 +29,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
 	u16 tci;
 
 	tcf_lastuse_update(&v->tcf_tm);
-	bstats_cpu_update(this_cpu_ptr(v->common.cpu_bstats), skb);
+	tcf_action_update_bstats(&v->common, skb);
 
 	/* Ensure 'data' points at mac_header prior calling vlan manipulating
 	 * functions.
-- 
cgit v1.2.3-59-g8ed1b


From 26b537a88ca5b7399c7ab0656e06dbd9da9513c1 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Wed, 30 Oct 2019 16:09:02 +0200
Subject: net: sched: extract qstats update code into functions

Extract common code that increments cpu_qstats counters into standalone act
API functions. Change hardware offloaded actions that use percpu counter
allocation to use the new functions instead of accessing cpu_qstats
directly.

This commit doesn't change functionality.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h  | 16 ++++++++++++++++
 net/sched/act_csum.c   |  2 +-
 net/sched/act_ct.c     |  2 +-
 net/sched/act_gact.c   |  2 +-
 net/sched/act_mirred.c |  2 +-
 net/sched/act_vlan.c   |  2 +-
 6 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 9a32853f77f9..8d6861ce205b 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -193,6 +193,22 @@ static inline void tcf_action_update_bstats(struct tc_action *a,
 	bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb);
 }
 
+static inline struct gnet_stats_queue *
+tcf_action_get_qstats(struct tc_action *a)
+{
+	return this_cpu_ptr(a->cpu_qstats);
+}
+
+static inline void tcf_action_inc_drop_qstats(struct tc_action *a)
+{
+	qstats_drop_inc(this_cpu_ptr(a->cpu_qstats));
+}
+
+static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a)
+{
+	qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats));
+}
+
 void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets,
 			     bool drop, bool hw);
 int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 69747b1860aa..bc909cf72257 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -624,7 +624,7 @@ out:
 	return action;
 
 drop:
-	qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats));
+	tcf_action_inc_drop_qstats(&p->common);
 	action = TC_ACT_SHOT;
 	goto out;
 }
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index f9779907dcf7..eabae2227e13 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -469,7 +469,7 @@ out:
 	return retval;
 
 drop:
-	qstats_drop_inc(this_cpu_ptr(a->cpu_qstats));
+	tcf_action_inc_drop_qstats(&c->common);
 	return TC_ACT_SHOT;
 }
 
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index a7e3d5621608..221f0c2e26b1 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -163,7 +163,7 @@ static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a,
 #endif
 	tcf_action_update_bstats(&gact->common, skb);
 	if (action == TC_ACT_SHOT)
-		qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats));
+		tcf_action_inc_drop_qstats(&gact->common);
 
 	tcf_lastuse_update(&gact->tcf_tm);
 
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index e5216f80883b..49a378a5b4fa 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -303,7 +303,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 
 	if (err) {
 out:
-		qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats));
+		tcf_action_inc_overlimit_qstats(&m->common);
 		if (tcf_mirred_is_act_redirect(m_eaction))
 			retval = TC_ACT_SHOT;
 	}
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index f6dccaa29239..ffa0f431aa84 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -88,7 +88,7 @@ out:
 	return action;
 
 drop:
-	qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
+	tcf_action_inc_drop_qstats(&v->common);
 	return TC_ACT_SHOT;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From ef816f3c49c1c404ababc50e10d4cbe5109da678 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Wed, 30 Oct 2019 16:09:03 +0200
Subject: net: sched: don't expose action qstats to skb_tc_reinsert()

Previous commit introduced helper function for updating qstats and
refactored set of actions to use the helpers, instead of modifying qstats
directly. However, one of the affected action exposes its qstats to
skb_tc_reinsert(), which then modifies it.

Refactor skb_tc_reinsert() to return integer error code and don't increment
overlimit qstats in case of error, and use the returned error code in
tcf_mirred_act() to manually increment the overlimit counter with new
helper function.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 12 ++----------
 net/sched/act_mirred.c    |  4 ++--
 2 files changed, 4 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 637548d54b3e..a8b0a9a4c686 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1286,17 +1286,9 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
 void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
 			  struct mini_Qdisc __rcu **p_miniq);
 
-static inline void skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res)
+static inline int skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res)
 {
-	struct gnet_stats_queue *stats = res->qstats;
-	int ret;
-
-	if (res->ingress)
-		ret = netif_receive_skb(skb);
-	else
-		ret = dev_queue_xmit(skb);
-	if (ret && stats)
-		qstats_overlimit_inc(res->qstats);
+	return res->ingress ? netif_receive_skb(skb) : dev_queue_xmit(skb);
 }
 
 #endif
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 49a378a5b4fa..ae1129aaf3c0 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -289,8 +289,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
 		/* let's the caller reinsert the packet, if possible */
 		if (use_reinsert) {
 			res->ingress = want_ingress;
-			res->qstats = this_cpu_ptr(m->common.cpu_qstats);
-			skb_tc_reinsert(skb, res);
+			if (skb_tc_reinsert(skb, res))
+				tcf_action_inc_overlimit_qstats(&m->common);
 			__this_cpu_dec(mirred_rec_level);
 			return TC_ACT_CONSUMED;
 		}
-- 
cgit v1.2.3-59-g8ed1b


From 5e174d5e73dfbfb2c4bc4804f58f2f2aa34c9281 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Wed, 30 Oct 2019 16:09:04 +0200
Subject: net: sched: modify stats helper functions to support regular stats

Modify stats update helper functions introduced in previous patches in this
series to fallback to regular tc_action->tcfa_{b|q}stats if cpu stats are
not allocated for the action argument. If regular non-percpu allocated
counters are in use, then obtain action tcfa_lock while modifying them.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h | 30 +++++++++++++++++++++---------
 net/sched/act_api.c   | 19 ++++++++++++++-----
 2 files changed, 35 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 8d6861ce205b..a56477051dae 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -190,23 +190,35 @@ int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
 static inline void tcf_action_update_bstats(struct tc_action *a,
 					    struct sk_buff *skb)
 {
-	bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb);
-}
-
-static inline struct gnet_stats_queue *
-tcf_action_get_qstats(struct tc_action *a)
-{
-	return this_cpu_ptr(a->cpu_qstats);
+	if (likely(a->cpu_bstats)) {
+		bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb);
+		return;
+	}
+	spin_lock(&a->tcfa_lock);
+	bstats_update(&a->tcfa_bstats, skb);
+	spin_unlock(&a->tcfa_lock);
 }
 
 static inline void tcf_action_inc_drop_qstats(struct tc_action *a)
 {
-	qstats_drop_inc(this_cpu_ptr(a->cpu_qstats));
+	if (likely(a->cpu_qstats)) {
+		qstats_drop_inc(this_cpu_ptr(a->cpu_qstats));
+		return;
+	}
+	spin_lock(&a->tcfa_lock);
+	qstats_drop_inc(&a->tcfa_qstats);
+	spin_unlock(&a->tcfa_lock);
 }
 
 static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a)
 {
-	qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats));
+	if (likely(a->cpu_qstats)) {
+		qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats));
+		return;
+	}
+	spin_lock(&a->tcfa_lock);
+	qstats_overlimit_inc(&a->tcfa_qstats);
+	spin_unlock(&a->tcfa_lock);
 }
 
 void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 0638afa2fc3f..f85b88da5216 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -992,14 +992,23 @@ err:
 void tcf_action_update_stats(struct tc_action *a, u64 bytes, u32 packets,
 			     bool drop, bool hw)
 {
-	_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+	if (a->cpu_bstats) {
+		_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
 
-	if (drop)
-		this_cpu_ptr(a->cpu_qstats)->drops += packets;
+		if (drop)
+			this_cpu_ptr(a->cpu_qstats)->drops += packets;
+
+		if (hw)
+			_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+					   bytes, packets);
+		return;
+	}
 
+	_bstats_update(&a->tcfa_bstats, bytes, packets);
+	if (drop)
+		a->tcfa_qstats.drops += packets;
 	if (hw)
-		_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
-				   bytes, packets);
+		_bstats_update(&a->tcfa_bstats_hw, bytes, packets);
 }
 EXPORT_SYMBOL(tcf_action_update_stats);
 
-- 
cgit v1.2.3-59-g8ed1b


From abbb0d33632ce931ca9c814813ee131351f6b92f Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Wed, 30 Oct 2019 16:09:05 +0200
Subject: net: sched: extend TCA_ACT space with TCA_ACT_FLAGS

Extend TCA_ACT space with nla_bitfield32 flags. Add
TCA_ACT_FLAGS_NO_PERCPU_STATS as the only allowed flag. Parse the flags in
tcf_action_init_1() and pass resulting value as additional argument to
a_o->init().

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h        |  2 +-
 include/uapi/linux/pkt_cls.h |  5 +++++
 net/sched/act_api.c          | 10 ++++++++--
 net/sched/act_bpf.c          |  3 ++-
 net/sched/act_connmark.c     |  2 +-
 net/sched/act_csum.c         |  2 +-
 net/sched/act_ct.c           |  2 +-
 net/sched/act_ctinfo.c       |  2 +-
 net/sched/act_gact.c         |  3 ++-
 net/sched/act_ife.c          |  3 ++-
 net/sched/act_ipt.c          | 10 +++++-----
 net/sched/act_mirred.c       |  2 +-
 net/sched/act_mpls.c         |  3 ++-
 net/sched/act_nat.c          |  2 +-
 net/sched/act_pedit.c        |  3 ++-
 net/sched/act_police.c       |  2 +-
 net/sched/act_sample.c       |  2 +-
 net/sched/act_simple.c       |  3 ++-
 net/sched/act_skbedit.c      |  2 +-
 net/sched/act_skbmod.c       |  2 +-
 net/sched/act_tunnel_key.c   |  2 +-
 net/sched/act_vlan.c         |  3 ++-
 22 files changed, 44 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index a56477051dae..85e95c44c7f9 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -94,7 +94,7 @@ struct tc_action_ops {
 	int     (*init)(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **act, int ovr,
 			int bind, bool rtnl_held, struct tcf_proto *tp,
-			struct netlink_ext_ack *extack);
+			u32 flags, struct netlink_ext_ack *extack);
 	int     (*walk)(struct net *, struct sk_buff *,
 			struct netlink_callback *, int,
 			const struct tc_action_ops *,
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index a6aa466fac9e..c6ad22f76ede 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -16,9 +16,14 @@ enum {
 	TCA_ACT_STATS,
 	TCA_ACT_PAD,
 	TCA_ACT_COOKIE,
+	TCA_ACT_FLAGS,
 	__TCA_ACT_MAX
 };
 
+#define TCA_ACT_FLAGS_NO_PERCPU_STATS 1 /* Don't use percpu allocator for
+					 * actions stats.
+					 */
+
 #define TCA_ACT_MAX __TCA_ACT_MAX
 #define TCA_OLD_COMPAT (TCA_ACT_MAX+1)
 #define TCA_ACT_MAX_PRIO 32
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index f85b88da5216..92c00207d5a1 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -831,12 +831,15 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
 	return c;
 }
 
+static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS;
 static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = {
 	[TCA_ACT_KIND]		= { .type = NLA_STRING },
 	[TCA_ACT_INDEX]		= { .type = NLA_U32 },
 	[TCA_ACT_COOKIE]	= { .type = NLA_BINARY,
 				    .len = TC_COOKIE_MAX_SIZE },
 	[TCA_ACT_OPTIONS]	= { .type = NLA_NESTED },
+	[TCA_ACT_FLAGS]		= { .type = NLA_BITFIELD32,
+				    .validation_data = &tca_act_flags_allowed },
 };
 
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
@@ -845,6 +848,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 				    bool rtnl_held,
 				    struct netlink_ext_ack *extack)
 {
+	struct nla_bitfield32 flags = { 0, 0 };
 	struct tc_action *a;
 	struct tc_action_ops *a_o;
 	struct tc_cookie *cookie = NULL;
@@ -876,6 +880,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 				goto err_out;
 			}
 		}
+		if (tb[TCA_ACT_FLAGS])
+			flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]);
 	} else {
 		if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) {
 			NL_SET_ERR_MSG(extack, "TC action name too long");
@@ -914,10 +920,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 	/* backward compatibility for policer */
 	if (name == NULL)
 		err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
-				rtnl_held, tp, extack);
+				rtnl_held, tp, flags.value, extack);
 	else
 		err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held,
-				tp, extack);
+				tp, flags.value, extack);
 	if (err < 0)
 		goto err_mod;
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 04b7bd4ec751..9e8cb43bc3fe 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -275,7 +275,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
 static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **act,
 			int replace, int bind, bool rtnl_held,
-			struct tcf_proto *tp, struct netlink_ext_ack *extack)
+			struct tcf_proto *tp, u32 flags,
+			struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, bpf_net_id);
 	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 2b43cacf82af..2e0ec6f80458 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -94,7 +94,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
 static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 			     struct nlattr *est, struct tc_action **a,
 			     int ovr, int bind, bool rtnl_held,
-			     struct tcf_proto *tp,
+			     struct tcf_proto *tp, u32 flags,
 			     struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, connmark_net_id);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index bc909cf72257..66e54fada44c 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -43,7 +43,7 @@ static struct tc_action_ops act_csum_ops;
 static int tcf_csum_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a, int ovr,
 			 int bind, bool rtnl_held, struct tcf_proto *tp,
-			 struct netlink_ext_ack *extack)
+			 u32 flags, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, csum_net_id);
 	struct tcf_csum_params *params_new;
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index eabae2227e13..92ec0bdb0547 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -656,7 +656,7 @@ static int tcf_ct_fill_params(struct net *net,
 static int tcf_ct_init(struct net *net, struct nlattr *nla,
 		       struct nlattr *est, struct tc_action **a,
 		       int replace, int bind, bool rtnl_held,
-		       struct tcf_proto *tp,
+		       struct tcf_proto *tp, u32 flags,
 		       struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, ct_net_id);
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 0dbcfd1dca7b..2205b2a934cc 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -153,7 +153,7 @@ static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
 static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a,
 			   int ovr, int bind, bool rtnl_held,
-			   struct tcf_proto *tp,
+			   struct tcf_proto *tp, u32 flags,
 			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 221f0c2e26b1..c3dc89160f3a 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -53,7 +53,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
 static int tcf_gact_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
 			 int ovr, int bind, bool rtnl_held,
-			 struct tcf_proto *tp, struct netlink_ext_ack *extack)
+			 struct tcf_proto *tp, u32 flags,
+			 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, gact_net_id);
 	struct nlattr *tb[TCA_GACT_MAX + 1];
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 3a31e241c647..f38d2a5fd608 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -465,7 +465,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
 static int tcf_ife_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **a,
 			int ovr, int bind, bool rtnl_held,
-			struct tcf_proto *tp, struct netlink_ext_ack *extack)
+			struct tcf_proto *tp, u32 flags,
+			struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, ife_net_id);
 	struct nlattr *tb[TCA_IFE_MAX + 1];
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 214a03d405cf..fbab70787477 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -95,7 +95,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
 static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
 			  struct nlattr *est, struct tc_action **a,
 			  const struct tc_action_ops *ops, int ovr, int bind,
-			  struct tcf_proto *tp)
+			  struct tcf_proto *tp, u32 flags)
 {
 	struct tc_action_net *tn = net_generic(net, id);
 	struct nlattr *tb[TCA_IPT_MAX + 1];
@@ -205,19 +205,19 @@ err1:
 static int tcf_ipt_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **a, int ovr,
 			int bind, bool rtnl_held, struct tcf_proto *tp,
-			struct netlink_ext_ack *extack)
+			u32 flags, struct netlink_ext_ack *extack)
 {
 	return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
-			      bind, tp);
+			      bind, tp, flags);
 }
 
 static int tcf_xt_init(struct net *net, struct nlattr *nla,
 		       struct nlattr *est, struct tc_action **a, int ovr,
 		       int bind, bool unlocked, struct tcf_proto *tp,
-		       struct netlink_ext_ack *extack)
+		       u32 flags, struct netlink_ext_ack *extack)
 {
 	return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
-			      bind, tp);
+			      bind, tp, flags);
 }
 
 static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a,
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index ae1129aaf3c0..17ed19d6dff4 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -93,7 +93,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a,
 			   int ovr, int bind, bool rtnl_held,
 			   struct tcf_proto *tp,
-			   struct netlink_ext_ack *extack)
+			   u32 flags, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, mirred_net_id);
 	struct nlattr *tb[TCA_MIRRED_MAX + 1];
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index 4cf6c553bb0b..efd7fe07141b 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -131,7 +131,8 @@ static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = {
 static int tcf_mpls_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
 			 int ovr, int bind, bool rtnl_held,
-			 struct tcf_proto *tp, struct netlink_ext_ack *extack)
+			 struct tcf_proto *tp, u32 flags,
+			 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, mpls_net_id);
 	struct nlattr *tb[TCA_MPLS_MAX + 1];
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index ea4c5359e7df..51d631cef92c 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -36,7 +36,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
 static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 			struct tc_action **a, int ovr, int bind,
 			bool rtnl_held,	struct tcf_proto *tp,
-			struct netlink_ext_ack *extack)
+			u32 flags, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, nat_net_id);
 	struct nlattr *tb[TCA_NAT_MAX + 1];
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index cdfaa79382a2..adf1cbd6ae46 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -137,7 +137,8 @@ nla_failure:
 static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 			  struct nlattr *est, struct tc_action **a,
 			  int ovr, int bind, bool rtnl_held,
-			  struct tcf_proto *tp, struct netlink_ext_ack *extack)
+			  struct tcf_proto *tp, u32 flags,
+			  struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 	struct nlattr *tb[TCA_PEDIT_MAX + 1];
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 51d34b1a61d5..7437b001f493 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -47,7 +47,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
 static int tcf_police_init(struct net *net, struct nlattr *nla,
 			       struct nlattr *est, struct tc_action **a,
 			       int ovr, int bind, bool rtnl_held,
-			       struct tcf_proto *tp,
+			       struct tcf_proto *tp, u32 flags,
 			       struct netlink_ext_ack *extack)
 {
 	int ret = 0, tcfp_result = TC_ACT_OK, err, size;
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 514456a0b9a8..6f9a745c3095 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -36,7 +36,7 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
 static int tcf_sample_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a, int ovr,
 			   int bind, bool rtnl_held, struct tcf_proto *tp,
-			   struct netlink_ext_ack *extack)
+			   u32 flags, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, sample_net_id);
 	struct nlattr *tb[TCA_SAMPLE_MAX + 1];
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 6120e56117ca..b18890f3eb67 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -86,7 +86,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
 static int tcf_simp_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
 			 int ovr, int bind, bool rtnl_held,
-			 struct tcf_proto *tp, struct netlink_ext_ack *extack)
+			 struct tcf_proto *tp, u32 flags,
+			 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, simp_net_id);
 	struct nlattr *tb[TCA_DEF_MAX + 1];
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 6a8d3337c577..25f3b7b56bea 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -86,7 +86,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
 static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 			    struct nlattr *est, struct tc_action **a,
 			    int ovr, int bind, bool rtnl_held,
-			    struct tcf_proto *tp,
+			    struct tcf_proto *tp, u32 act_flags,
 			    struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 888437f97ba6..8e1dc0d6b4b0 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -79,7 +79,7 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
 static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a,
 			   int ovr, int bind, bool rtnl_held,
-			   struct tcf_proto *tp,
+			   struct tcf_proto *tp, u32 flags,
 			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 9ab2d3b4a9fc..b25e5124f571 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -208,7 +208,7 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p)
 static int tunnel_key_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a,
 			   int ovr, int bind, bool rtnl_held,
-			   struct tcf_proto *tp,
+			   struct tcf_proto *tp, u32 act_flags,
 			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index ffa0f431aa84..4b4000338a09 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -102,7 +102,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
 static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
 			 int ovr, int bind, bool rtnl_held,
-			 struct tcf_proto *tp, struct netlink_ext_ack *extack)
+			 struct tcf_proto *tp, u32 flags,
+			 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, vlan_net_id);
 	struct nlattr *tb[TCA_VLAN_MAX + 1];
-- 
cgit v1.2.3-59-g8ed1b


From e38226786022d2d8e5876ab7bc37e82b0eb57e65 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@mellanox.com>
Date: Wed, 30 Oct 2019 16:09:06 +0200
Subject: net: sched: update action implementations to support flags

Extend struct tc_action with new "tcfa_flags" field. Set the field in
tcf_idr_create() function and provide new helper
tcf_idr_create_from_flags() that derives 'cpustats' boolean from flags
value. Update individual hardware-offloaded actions init() to pass their
"flags" argument to new helper in order to skip percpu stats allocation
when user requested it through flags.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h      |  7 ++++++-
 net/sched/act_api.c        | 22 +++++++++++++++++++++-
 net/sched/act_bpf.c        |  2 +-
 net/sched/act_connmark.c   |  2 +-
 net/sched/act_csum.c       |  4 ++--
 net/sched/act_ct.c         |  4 ++--
 net/sched/act_ctinfo.c     |  2 +-
 net/sched/act_gact.c       |  4 ++--
 net/sched/act_ife.c        |  2 +-
 net/sched/act_ipt.c        |  2 +-
 net/sched/act_mirred.c     |  4 ++--
 net/sched/act_mpls.c       |  2 +-
 net/sched/act_nat.c        |  2 +-
 net/sched/act_pedit.c      |  2 +-
 net/sched/act_police.c     |  2 +-
 net/sched/act_sample.c     |  2 +-
 net/sched/act_simple.c     |  2 +-
 net/sched/act_skbedit.c    |  2 +-
 net/sched/act_skbmod.c     |  2 +-
 net/sched/act_tunnel_key.c |  5 +++--
 net/sched/act_vlan.c       |  4 ++--
 21 files changed, 53 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 85e95c44c7f9..0495bdc034d2 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -41,6 +41,7 @@ struct tc_action {
 	struct gnet_stats_queue __percpu *cpu_qstats;
 	struct tc_cookie	__rcu *act_cookie;
 	struct tcf_chain	__rcu *goto_chain;
+	u32			tcfa_flags;
 };
 #define tcf_index	common.tcfa_index
 #define tcf_refcnt	common.tcfa_refcnt
@@ -154,7 +155,11 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
 int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index);
 int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 		   struct tc_action **a, const struct tc_action_ops *ops,
-		   int bind, bool cpustats);
+		   int bind, bool cpustats, u32 flags);
+int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index,
+			      struct nlattr *est, struct tc_action **a,
+			      const struct tc_action_ops *ops, int bind,
+			      u32 flags);
 void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a);
 
 void tcf_idr_cleanup(struct tc_action_net *tn, u32 index);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 92c00207d5a1..6284c552e943 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -399,7 +399,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index)
 
 int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 		   struct tc_action **a, const struct tc_action_ops *ops,
-		   int bind, bool cpustats)
+		   int bind, bool cpustats, u32 flags)
 {
 	struct tc_action *p = kzalloc(ops->size, GFP_KERNEL);
 	struct tcf_idrinfo *idrinfo = tn->idrinfo;
@@ -427,6 +427,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 	p->tcfa_tm.install = jiffies;
 	p->tcfa_tm.lastuse = jiffies;
 	p->tcfa_tm.firstuse = 0;
+	p->tcfa_flags = flags;
 	if (est) {
 		err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats,
 					&p->tcfa_rate_est,
@@ -451,6 +452,17 @@ err1:
 }
 EXPORT_SYMBOL(tcf_idr_create);
 
+int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index,
+			      struct nlattr *est, struct tc_action **a,
+			      const struct tc_action_ops *ops, int bind,
+			      u32 flags)
+{
+	/* Set cpustats according to actions flags. */
+	return tcf_idr_create(tn, index, est, a, ops, bind,
+			      !(flags & TCA_ACT_FLAGS_NO_PERCPU_STATS), flags);
+}
+EXPORT_SYMBOL(tcf_idr_create_from_flags);
+
 void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
 {
 	struct tcf_idrinfo *idrinfo = tn->idrinfo;
@@ -773,6 +785,14 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
 	}
 	rcu_read_unlock();
 
+	if (a->tcfa_flags) {
+		struct nla_bitfield32 flags = { a->tcfa_flags,
+						a->tcfa_flags, };
+
+		if (nla_put(skb, TCA_ACT_FLAGS, sizeof(flags), &flags))
+			goto nla_put_failure;
+	}
+
 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
 	if (nest == NULL)
 		goto nla_put_failure;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 9e8cb43bc3fe..46f47e58b3be 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -304,7 +304,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 	ret = tcf_idr_check_alloc(tn, &index, act, bind);
 	if (!ret) {
 		ret = tcf_idr_create(tn, index, est, act,
-				     &act_bpf_ops, bind, true);
+				     &act_bpf_ops, bind, true, 0);
 		if (ret < 0) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 2e0ec6f80458..43a243081e7d 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -121,7 +121,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 	ret = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (!ret) {
 		ret = tcf_idr_create(tn, index, est, a,
-				     &act_connmark_ops, bind, false);
+				     &act_connmark_ops, bind, false, 0);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 66e54fada44c..16e67e1c1db1 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -68,8 +68,8 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
 	index = parm->index;
 	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (!err) {
-		ret = tcf_idr_create(tn, index, est, a,
-				     &act_csum_ops, bind, true);
+		ret = tcf_idr_create_from_flags(tn, index, est, a,
+						&act_csum_ops, bind, flags);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 92ec0bdb0547..68d6af56b243 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -688,8 +688,8 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
 		return err;
 
 	if (!err) {
-		err = tcf_idr_create(tn, index, est, a,
-				     &act_ct_ops, bind, true);
+		err = tcf_idr_create_from_flags(tn, index, est, a,
+						&act_ct_ops, bind, flags);
 		if (err) {
 			tcf_idr_cleanup(tn, index);
 			return err;
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 2205b2a934cc..b1e601007242 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -210,7 +210,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
 	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (!err) {
 		ret = tcf_idr_create(tn, index, est, a,
-				     &act_ctinfo_ops, bind, false);
+				     &act_ctinfo_ops, bind, false, 0);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index c3dc89160f3a..416065772719 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -99,8 +99,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
 
 	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (!err) {
-		ret = tcf_idr_create(tn, index, est, a,
-				     &act_gact_ops, bind, true);
+		ret = tcf_idr_create_from_flags(tn, index, est, a,
+						&act_gact_ops, bind, flags);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index f38d2a5fd608..d562c88cccbe 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -523,7 +523,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
 
 	if (!exists) {
 		ret = tcf_idr_create(tn, index, est, a, &act_ife_ops,
-				     bind, true);
+				     bind, true, 0);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			kfree(p);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index fbab70787477..400a2cfe8452 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -144,7 +144,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla,
 
 	if (!exists) {
 		ret = tcf_idr_create(tn, index, est, a, ops, bind,
-				     false);
+				     false, 0);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 17ed19d6dff4..b6e1b5bbb4da 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -148,8 +148,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 			NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist");
 			return -EINVAL;
 		}
-		ret = tcf_idr_create(tn, index, est, a,
-				     &act_mirred_ops, bind, true);
+		ret = tcf_idr_create_from_flags(tn, index, est, a,
+						&act_mirred_ops, bind, flags);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index efd7fe07141b..4d8c822b6aca 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -225,7 +225,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
 
 	if (!exists) {
 		ret = tcf_idr_create(tn, index, est, a,
-				     &act_mpls_ops, bind, true);
+				     &act_mpls_ops, bind, true, 0);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 51d631cef92c..88a1b79a1848 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -61,7 +61,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 	err = tcf_idr_check_alloc(tn, &index, a, bind);
 	if (!err) {
 		ret = tcf_idr_create(tn, index, est, a,
-				     &act_nat_ops, bind, false);
+				     &act_nat_ops, bind, false, 0);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index adf1cbd6ae46..d5eff6ac17a9 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -191,7 +191,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 			goto out_free;
 		}
 		ret = tcf_idr_create(tn, index, est, a,
-				     &act_pedit_ops, bind, false);
+				     &act_pedit_ops, bind, false, 0);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			goto out_free;
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 7437b001f493..d96271590268 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -87,7 +87,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 
 	if (!exists) {
 		ret = tcf_idr_create(tn, index, NULL, a,
-				     &act_police_ops, bind, true);
+				     &act_police_ops, bind, true, 0);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 6f9a745c3095..29b23bfaf10d 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -69,7 +69,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
 
 	if (!exists) {
 		ret = tcf_idr_create(tn, index, est, a,
-				     &act_sample_ops, bind, true);
+				     &act_sample_ops, bind, true, 0);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index b18890f3eb67..97639b259cd7 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -128,7 +128,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
 
 	if (!exists) {
 		ret = tcf_idr_create(tn, index, est, a,
-				     &act_simp_ops, bind, false);
+				     &act_simp_ops, bind, false, 0);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 25f3b7b56bea..5f7ca7f89ca2 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -165,7 +165,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 
 	if (!exists) {
 		ret = tcf_idr_create(tn, index, est, a,
-				     &act_skbedit_ops, bind, true);
+				     &act_skbedit_ops, bind, true, 0);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 8e1dc0d6b4b0..39e6d94cfafb 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -143,7 +143,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 
 	if (!exists) {
 		ret = tcf_idr_create(tn, index, est, a,
-				     &act_skbmod_ops, bind, true);
+				     &act_skbmod_ops, bind, true, 0);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index b25e5124f571..cb34e5d57aaa 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -347,8 +347,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (!exists) {
-		ret = tcf_idr_create(tn, index, est, a,
-				     &act_tunnel_key_ops, bind, true);
+		ret = tcf_idr_create_from_flags(tn, index, est, a,
+						&act_tunnel_key_ops, bind,
+						act_flags);
 		if (ret) {
 			NL_SET_ERR_MSG(extack, "Cannot create TC IDR");
 			goto release_tun_meta;
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 4b4000338a09..b6939abc61eb 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -189,8 +189,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	action = parm->v_action;
 
 	if (!exists) {
-		ret = tcf_idr_create(tn, index, est, a,
-				     &act_vlan_ops, bind, true);
+		ret = tcf_idr_create_from_flags(tn, index, est, a,
+						&act_vlan_ops, bind, flags);
 		if (ret) {
 			tcf_idr_cleanup(tn, index);
 			return ret;
-- 
cgit v1.2.3-59-g8ed1b


From f1b9509c2fb0ef4db8d22dac9aef8e856a5d81f6 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 30 Oct 2019 15:32:11 -0700
Subject: bpf: Replace prog_raw_tp+btf_id with prog_tracing

The bpf program type raw_tp together with 'expected_attach_type'
was the most appropriate api to indicate BTF-enabled raw_tp programs.
But during development it became apparent that 'expected_attach_type'
cannot be used and new 'attach_btf_id' field had to be introduced.
Which means that the information is duplicated in two fields where
one of them is ignored.
Clean it up by introducing new program type where both
'expected_attach_type' and 'attach_btf_id' fields have
specific meaning.
In the future 'expected_attach_type' will be extended
with other attach points that have similar semantics to raw_tp.
This patch is replacing BTF-enabled BPF_PROG_TYPE_RAW_TRACEPOINT with
prog_type = BPF_RPOG_TYPE_TRACING
expected_attach_type = BPF_TRACE_RAW_TP
attach_btf_id = btf_id of raw tracepoint inside the kernel
Future patches will add
expected_attach_type = BPF_TRACE_FENTRY or BPF_TRACE_FEXIT
where programs have the same input context and the same helpers,
but different attach points.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20191030223212.953010-2-ast@kernel.org
---
 include/linux/bpf.h       |  5 +++++
 include/linux/bpf_types.h |  1 +
 include/uapi/linux/bpf.h  |  2 ++
 kernel/bpf/syscall.c      |  6 +++---
 kernel/bpf/verifier.c     | 34 ++++++++++++++++++++++++----------
 kernel/trace/bpf_trace.c  | 44 ++++++++++++++++++++++++++++++++++++--------
 6 files changed, 71 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 171be30fe0ae..80158cff44bd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -373,6 +373,11 @@ enum bpf_cgroup_storage_type {
 
 #define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
 
+/* The longest tracepoint has 12 args.
+ * See include/trace/bpf_probe.h
+ */
+#define MAX_BPF_FUNC_ARGS 12
+
 struct bpf_prog_stats {
 	u64 cnt;
 	u64 nsecs;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 36a9c2325176..de14872b01ba 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -26,6 +26,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
 BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable)
+BPF_PROG_TYPE(BPF_PROG_TYPE_TRACING, tracing)
 #endif
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4af8b0819a32..a6bf19dabaab 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -173,6 +173,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_CGROUP_SYSCTL,
 	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
 	BPF_PROG_TYPE_CGROUP_SOCKOPT,
+	BPF_PROG_TYPE_TRACING,
 };
 
 enum bpf_attach_type {
@@ -199,6 +200,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_UDP6_RECVMSG,
 	BPF_CGROUP_GETSOCKOPT,
 	BPF_CGROUP_SETSOCKOPT,
+	BPF_TRACE_RAW_TP,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ff5225759553..985d01ced196 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1571,7 +1571,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 			   u32 btf_id)
 {
 	switch (prog_type) {
-	case BPF_PROG_TYPE_RAW_TRACEPOINT:
+	case BPF_PROG_TYPE_TRACING:
 		if (btf_id > BTF_MAX_TYPE)
 			return -EINVAL;
 		break;
@@ -1833,13 +1833,13 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 		return PTR_ERR(prog);
 
 	if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
+	    prog->type != BPF_PROG_TYPE_TRACING &&
 	    prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
 		err = -EINVAL;
 		goto out_put_prog;
 	}
 
-	if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT &&
-	    prog->aux->attach_btf_id) {
+	if (prog->type == BPF_PROG_TYPE_TRACING) {
 		if (attr->raw_tracepoint.name) {
 			/* raw_tp name should not be specified in raw_tp
 			 * programs that were verified via in-kernel BTF info
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6b0de04f8b91..2f2374967b36 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9381,24 +9381,36 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 {
 	struct bpf_prog *prog = env->prog;
 	u32 btf_id = prog->aux->attach_btf_id;
+	const char prefix[] = "btf_trace_";
 	const struct btf_type *t;
 	const char *tname;
 
-	if (prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT && btf_id) {
-		const char prefix[] = "btf_trace_";
+	if (prog->type != BPF_PROG_TYPE_TRACING)
+		return 0;
 
-		t = btf_type_by_id(btf_vmlinux, btf_id);
-		if (!t) {
-			verbose(env, "attach_btf_id %u is invalid\n", btf_id);
-			return -EINVAL;
-		}
+	if (!btf_id) {
+		verbose(env, "Tracing programs must provide btf_id\n");
+		return -EINVAL;
+	}
+	t = btf_type_by_id(btf_vmlinux, btf_id);
+	if (!t) {
+		verbose(env, "attach_btf_id %u is invalid\n", btf_id);
+		return -EINVAL;
+	}
+	tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+	if (!tname) {
+		verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id);
+		return -EINVAL;
+	}
+
+	switch (prog->expected_attach_type) {
+	case BPF_TRACE_RAW_TP:
 		if (!btf_type_is_typedef(t)) {
 			verbose(env, "attach_btf_id %u is not a typedef\n",
 				btf_id);
 			return -EINVAL;
 		}
-		tname = btf_name_by_offset(btf_vmlinux, t->name_off);
-		if (!tname || strncmp(prefix, tname, sizeof(prefix) - 1)) {
+		if (strncmp(prefix, tname, sizeof(prefix) - 1)) {
 			verbose(env, "attach_btf_id %u points to wrong type name %s\n",
 				btf_id, tname);
 			return -EINVAL;
@@ -9419,8 +9431,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 		prog->aux->attach_func_name = tname;
 		prog->aux->attach_func_proto = t;
 		prog->aux->attach_btf_trace = true;
+		return 0;
+	default:
+		return -EINVAL;
 	}
-	return 0;
 }
 
 int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 571c25d60710..f50bf19f7a05 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1055,10 +1055,6 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
 		return &bpf_perf_event_output_proto_raw_tp;
-#ifdef CONFIG_NET
-	case BPF_FUNC_skb_output:
-		return &bpf_skb_output_proto;
-#endif
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_raw_tp;
 	case BPF_FUNC_get_stack:
@@ -1068,20 +1064,44 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+static const struct bpf_func_proto *
+tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+#ifdef CONFIG_NET
+	case BPF_FUNC_skb_output:
+		return &bpf_skb_output_proto;
+#endif
+	default:
+		return raw_tp_prog_func_proto(func_id, prog);
+	}
+}
+
 static bool raw_tp_prog_is_valid_access(int off, int size,
 					enum bpf_access_type type,
 					const struct bpf_prog *prog,
 					struct bpf_insn_access_aux *info)
 {
-	/* largest tracepoint in the kernel has 12 args */
-	if (off < 0 || off >= sizeof(__u64) * 12)
+	if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
+		return false;
+	if (type != BPF_READ)
+		return false;
+	if (off % size != 0)
+		return false;
+	return true;
+}
+
+static bool tracing_prog_is_valid_access(int off, int size,
+					 enum bpf_access_type type,
+					 const struct bpf_prog *prog,
+					 struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
 		return false;
 	if (type != BPF_READ)
 		return false;
 	if (off % size != 0)
 		return false;
-	if (!prog->aux->attach_btf_id)
-		return true;
 	return btf_ctx_access(off, size, type, prog, info);
 }
 
@@ -1093,6 +1113,14 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
 const struct bpf_prog_ops raw_tracepoint_prog_ops = {
 };
 
+const struct bpf_verifier_ops tracing_verifier_ops = {
+	.get_func_proto  = tracing_prog_func_proto,
+	.is_valid_access = tracing_prog_is_valid_access,
+};
+
+const struct bpf_prog_ops tracing_prog_ops = {
+};
+
 static bool raw_tp_writable_prog_is_valid_access(int off, int size,
 						 enum bpf_access_type type,
 						 const struct bpf_prog *prog,
-- 
cgit v1.2.3-59-g8ed1b


From 246880958ac93989c97c73ae1e60b78b4c4c88c5 Mon Sep 17 00:00:00 2001
From: Vikas Gupta <vikas.gupta@broadcom.com>
Date: Thu, 31 Oct 2019 15:38:50 +0530
Subject: firmware: broadcom: add OP-TEE based BNXT f/w manager

This driver registers on TEE bus to interact with OP-TEE based
BNXT firmware management modules

Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Vikas Gupta <vikas.gupta@broadcom.com>
Signed-off-by: Sheetal Tigadoli <sheetal.tigadoli@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/firmware/broadcom/Kconfig             |   8 +
 drivers/firmware/broadcom/Makefile            |   1 +
 drivers/firmware/broadcom/tee_bnxt_fw.c       | 279 ++++++++++++++++++++++++++
 include/linux/firmware/broadcom/tee_bnxt_fw.h |  14 ++
 4 files changed, 302 insertions(+)
 create mode 100644 drivers/firmware/broadcom/tee_bnxt_fw.c
 create mode 100644 include/linux/firmware/broadcom/tee_bnxt_fw.h

(limited to 'include')

diff --git a/drivers/firmware/broadcom/Kconfig b/drivers/firmware/broadcom/Kconfig
index d03ed8e43ad7..8e3d355a637a 100644
--- a/drivers/firmware/broadcom/Kconfig
+++ b/drivers/firmware/broadcom/Kconfig
@@ -22,3 +22,11 @@ config BCM47XX_SPROM
 	  In case of SoC devices SPROM content is stored on a flash used by
 	  bootloader firmware CFE. This driver provides method to ssb and bcma
 	  drivers to read SPROM on SoC.
+
+config TEE_BNXT_FW
+	tristate "Broadcom BNXT firmware manager"
+	depends on (ARCH_BCM_IPROC && OPTEE) || (COMPILE_TEST && TEE)
+	default ARCH_BCM_IPROC
+	help
+	  This module help to manage firmware on Broadcom BNXT device. The module
+	  registers on tee bus and invoke calls to manage firmware on BNXT device.
diff --git a/drivers/firmware/broadcom/Makefile b/drivers/firmware/broadcom/Makefile
index 72c7fdc20c77..17c5061c47a7 100644
--- a/drivers/firmware/broadcom/Makefile
+++ b/drivers/firmware/broadcom/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_BCM47XX_NVRAM)		+= bcm47xx_nvram.o
 obj-$(CONFIG_BCM47XX_SPROM)		+= bcm47xx_sprom.o
+obj-$(CONFIG_TEE_BNXT_FW)		+= tee_bnxt_fw.o
diff --git a/drivers/firmware/broadcom/tee_bnxt_fw.c b/drivers/firmware/broadcom/tee_bnxt_fw.c
new file mode 100644
index 000000000000..5b7ef89eb701
--- /dev/null
+++ b/drivers/firmware/broadcom/tee_bnxt_fw.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Broadcom.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/tee_drv.h>
+#include <linux/uuid.h>
+
+#include <linux/firmware/broadcom/tee_bnxt_fw.h>
+
+#define MAX_SHM_MEM_SZ	SZ_4M
+
+#define MAX_TEE_PARAM_ARRY_MEMB		4
+
+enum ta_cmd {
+	/*
+	 * TA_CMD_BNXT_FASTBOOT - boot bnxt device by copying f/w into sram
+	 *
+	 *	param[0] unused
+	 *	param[1] unused
+	 *	param[2] unused
+	 *	param[3] unused
+	 *
+	 * Result:
+	 *	TEE_SUCCESS - Invoke command success
+	 *	TEE_ERROR_ITEM_NOT_FOUND - Corrupt f/w image found on memory
+	 */
+	TA_CMD_BNXT_FASTBOOT = 0,
+
+	/*
+	 * TA_CMD_BNXT_COPY_COREDUMP - copy the core dump into shm
+	 *
+	 *	param[0] (inout memref) - Coredump buffer memory reference
+	 *	param[1] (in value) - value.a: offset, data to be copied from
+	 *			      value.b: size of data to be copied
+	 *	param[2] unused
+	 *	param[3] unused
+	 *
+	 * Result:
+	 *	TEE_SUCCESS - Invoke command success
+	 *	TEE_ERROR_BAD_PARAMETERS - Incorrect input param
+	 *	TEE_ERROR_ITEM_NOT_FOUND - Corrupt core dump
+	 */
+	TA_CMD_BNXT_COPY_COREDUMP = 3,
+};
+
+/**
+ * struct tee_bnxt_fw_private - OP-TEE bnxt private data
+ * @dev:		OP-TEE based bnxt device.
+ * @ctx:		OP-TEE context handler.
+ * @session_id:		TA session identifier.
+ */
+struct tee_bnxt_fw_private {
+	struct device *dev;
+	struct tee_context *ctx;
+	u32 session_id;
+	struct tee_shm *fw_shm_pool;
+};
+
+static struct tee_bnxt_fw_private pvt_data;
+
+static void prepare_args(int cmd,
+			 struct tee_ioctl_invoke_arg *arg,
+			 struct tee_param *param)
+{
+	memset(arg, 0, sizeof(*arg));
+	memset(param, 0, MAX_TEE_PARAM_ARRY_MEMB * sizeof(*param));
+
+	arg->func = cmd;
+	arg->session = pvt_data.session_id;
+	arg->num_params = MAX_TEE_PARAM_ARRY_MEMB;
+
+	/* Fill invoke cmd params */
+	switch (cmd) {
+	case TA_CMD_BNXT_COPY_COREDUMP:
+		param[0].attr = TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT;
+		param[0].u.memref.shm = pvt_data.fw_shm_pool;
+		param[0].u.memref.size = MAX_SHM_MEM_SZ;
+		param[0].u.memref.shm_offs = 0;
+		param[1].attr = TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT;
+		break;
+	case TA_CMD_BNXT_FASTBOOT:
+	default:
+		/* Nothing to do */
+		break;
+	}
+}
+
+/**
+ * tee_bnxt_fw_load() - Load the bnxt firmware
+ *		    Uses an OP-TEE call to start a secure
+ *		    boot process.
+ * Returns 0 on success, negative errno otherwise.
+ */
+int tee_bnxt_fw_load(void)
+{
+	int ret = 0;
+	struct tee_ioctl_invoke_arg arg;
+	struct tee_param param[MAX_TEE_PARAM_ARRY_MEMB];
+
+	if (!pvt_data.ctx)
+		return -ENODEV;
+
+	prepare_args(TA_CMD_BNXT_FASTBOOT, &arg, param);
+
+	ret = tee_client_invoke_func(pvt_data.ctx, &arg, param);
+	if (ret < 0 || arg.ret != 0) {
+		dev_err(pvt_data.dev,
+			"TA_CMD_BNXT_FASTBOOT invoke failed TEE err: %x, ret:%x\n",
+			arg.ret, ret);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(tee_bnxt_fw_load);
+
+/**
+ * tee_bnxt_copy_coredump() - Copy coredump from the allocated memory
+ *			    Uses an OP-TEE call to copy coredump
+ * @buf:	destination buffer where core dump is copied into
+ * @offset:	offset from the base address of core dump area
+ * @size:	size of the dump
+ *
+ * Returns 0 on success, negative errno otherwise.
+ */
+int tee_bnxt_copy_coredump(void *buf, u32 offset, u32 size)
+{
+	struct tee_ioctl_invoke_arg arg;
+	struct tee_param param[MAX_TEE_PARAM_ARRY_MEMB];
+	void *core_data;
+	u32 rbytes = size;
+	u32 nbytes = 0;
+	int ret = 0;
+
+	if (!pvt_data.ctx)
+		return -ENODEV;
+
+	prepare_args(TA_CMD_BNXT_COPY_COREDUMP, &arg, param);
+
+	while (rbytes)  {
+		nbytes = rbytes;
+
+		nbytes = min_t(u32, rbytes, param[0].u.memref.size);
+
+		/* Fill additional invoke cmd params */
+		param[1].u.value.a = offset;
+		param[1].u.value.b = nbytes;
+
+		ret = tee_client_invoke_func(pvt_data.ctx, &arg, param);
+		if (ret < 0 || arg.ret != 0) {
+			dev_err(pvt_data.dev,
+				"TA_CMD_BNXT_COPY_COREDUMP invoke failed TEE err: %x, ret:%x\n",
+				arg.ret, ret);
+			return -EINVAL;
+		}
+
+		core_data = tee_shm_get_va(pvt_data.fw_shm_pool, 0);
+		if (IS_ERR(core_data)) {
+			dev_err(pvt_data.dev, "tee_shm_get_va failed\n");
+			return PTR_ERR(core_data);
+		}
+
+		memcpy(buf, core_data, nbytes);
+
+		rbytes -= nbytes;
+		buf += nbytes;
+		offset += nbytes;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(tee_bnxt_copy_coredump);
+
+static int optee_ctx_match(struct tee_ioctl_version_data *ver, const void *data)
+{
+	return (ver->impl_id == TEE_IMPL_ID_OPTEE);
+}
+
+static int tee_bnxt_fw_probe(struct device *dev)
+{
+	struct tee_client_device *bnxt_device = to_tee_client_device(dev);
+	int ret, err = -ENODEV;
+	struct tee_ioctl_open_session_arg sess_arg;
+	struct tee_shm *fw_shm_pool;
+
+	memset(&sess_arg, 0, sizeof(sess_arg));
+
+	/* Open context with TEE driver */
+	pvt_data.ctx = tee_client_open_context(NULL, optee_ctx_match, NULL,
+					       NULL);
+	if (IS_ERR(pvt_data.ctx))
+		return -ENODEV;
+
+	/* Open session with Bnxt load Trusted App */
+	memcpy(sess_arg.uuid, bnxt_device->id.uuid.b, TEE_IOCTL_UUID_LEN);
+	sess_arg.clnt_login = TEE_IOCTL_LOGIN_PUBLIC;
+	sess_arg.num_params = 0;
+
+	ret = tee_client_open_session(pvt_data.ctx, &sess_arg, NULL);
+	if (ret < 0 || sess_arg.ret != 0) {
+		dev_err(dev, "tee_client_open_session failed, err: %x\n",
+			sess_arg.ret);
+		err = -EINVAL;
+		goto out_ctx;
+	}
+	pvt_data.session_id = sess_arg.session;
+
+	pvt_data.dev = dev;
+
+	fw_shm_pool = tee_shm_alloc(pvt_data.ctx, MAX_SHM_MEM_SZ,
+				    TEE_SHM_MAPPED | TEE_SHM_DMA_BUF);
+	if (IS_ERR(fw_shm_pool)) {
+		tee_client_close_context(pvt_data.ctx);
+		dev_err(pvt_data.dev, "tee_shm_alloc failed\n");
+		err = PTR_ERR(fw_shm_pool);
+		goto out_sess;
+	}
+
+	pvt_data.fw_shm_pool = fw_shm_pool;
+
+	return 0;
+
+out_sess:
+	tee_client_close_session(pvt_data.ctx, pvt_data.session_id);
+out_ctx:
+	tee_client_close_context(pvt_data.ctx);
+
+	return err;
+}
+
+static int tee_bnxt_fw_remove(struct device *dev)
+{
+	tee_shm_free(pvt_data.fw_shm_pool);
+	tee_client_close_session(pvt_data.ctx, pvt_data.session_id);
+	tee_client_close_context(pvt_data.ctx);
+	pvt_data.ctx = NULL;
+
+	return 0;
+}
+
+static const struct tee_client_device_id tee_bnxt_fw_id_table[] = {
+	{UUID_INIT(0x6272636D, 0x2019, 0x0716,
+		    0x42, 0x43, 0x4D, 0x5F, 0x53, 0x43, 0x48, 0x49)},
+	{}
+};
+
+MODULE_DEVICE_TABLE(tee, tee_bnxt_fw_id_table);
+
+static struct tee_client_driver tee_bnxt_fw_driver = {
+	.id_table	= tee_bnxt_fw_id_table,
+	.driver		= {
+		.name		= KBUILD_MODNAME,
+		.bus		= &tee_bus_type,
+		.probe		= tee_bnxt_fw_probe,
+		.remove		= tee_bnxt_fw_remove,
+	},
+};
+
+static int __init tee_bnxt_fw_mod_init(void)
+{
+	return driver_register(&tee_bnxt_fw_driver.driver);
+}
+
+static void __exit tee_bnxt_fw_mod_exit(void)
+{
+	driver_unregister(&tee_bnxt_fw_driver.driver);
+}
+
+module_init(tee_bnxt_fw_mod_init);
+module_exit(tee_bnxt_fw_mod_exit);
+
+MODULE_AUTHOR("Vikas Gupta <vikas.gupta@broadcom.com>");
+MODULE_DESCRIPTION("Broadcom bnxt firmware manager");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/firmware/broadcom/tee_bnxt_fw.h b/include/linux/firmware/broadcom/tee_bnxt_fw.h
new file mode 100644
index 000000000000..f24c82d6ef73
--- /dev/null
+++ b/include/linux/firmware/broadcom/tee_bnxt_fw.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+/*
+ * Copyright 2019 Broadcom.
+ */
+
+#ifndef _BROADCOM_TEE_BNXT_FW_H
+#define _BROADCOM_TEE_BNXT_FW_H
+
+#include <linux/types.h>
+
+int tee_bnxt_fw_load(void);
+int tee_bnxt_copy_coredump(void *buf, u32 offset, u32 size);
+
+#endif /* _BROADCOM_TEE_BNXT_FW_H */
-- 
cgit v1.2.3-59-g8ed1b


From a2d00f3db73dc4f6f6afcc95c1db809ea9019306 Mon Sep 17 00:00:00 2001
From: Madalin Bucur <madalin.bucur@nxp.com>
Date: Thu, 31 Oct 2019 16:37:58 +0200
Subject: soc: fsl: qbman: allow registering a device link for the portal user

Introduce the API required to make sure that the devices that use
the QMan portal are unbound when the portal is unbound.

Signed-off-by: Madalin Bucur <madalin.bucur@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/soc/fsl/qbman/qman.c | 13 +++++++++++++
 include/soc/fsl/qman.h       | 18 ++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c
index bf68d86d80ee..bc75a5882b9e 100644
--- a/drivers/soc/fsl/qbman/qman.c
+++ b/drivers/soc/fsl/qbman/qman.c
@@ -1749,6 +1749,19 @@ struct qman_portal *qman_get_affine_portal(int cpu)
 }
 EXPORT_SYMBOL(qman_get_affine_portal);
 
+int qman_start_using_portal(struct qman_portal *p, struct device *dev)
+{
+	return (!device_link_add(dev, p->config->dev,
+				 DL_FLAG_AUTOREMOVE_CONSUMER)) ? -EINVAL : 0;
+}
+EXPORT_SYMBOL(qman_start_using_portal);
+
+void qman_stop_using_portal(struct qman_portal *p, struct device *dev)
+{
+	device_link_remove(dev, p->config->dev);
+}
+EXPORT_SYMBOL(qman_stop_using_portal);
+
 int qman_p_poll_dqrr(struct qman_portal *p, unsigned int limit)
 {
 	return __poll_portal_fast(p, limit);
diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h
index aa31c05a103a..c499c5cfa7c9 100644
--- a/include/soc/fsl/qman.h
+++ b/include/soc/fsl/qman.h
@@ -32,6 +32,7 @@
 #define __FSL_QMAN_H
 
 #include <linux/bitops.h>
+#include <linux/device.h>
 
 /* Hardware constants */
 #define QM_CHANNEL_SWPORTAL0 0
@@ -914,6 +915,23 @@ u16 qman_affine_channel(int cpu);
  */
 struct qman_portal *qman_get_affine_portal(int cpu);
 
+/**
+ * qman_start_using_portal - register a device link for the portal user
+ * @p: the portal that will be in use
+ * @dev: the device that will use the portal
+ *
+ * Makes sure that the devices that use the portal are unbound when the
+ * portal is unbound
+ */
+int qman_start_using_portal(struct qman_portal *p, struct device *dev);
+
+/**
+ * qman_stop_using_portal - deregister a device link for the portal user
+ * @p: the portal that will no longer be in use
+ * @dev: the device that uses the portal
+ */
+void qman_stop_using_portal(struct qman_portal *p, struct device *dev);
+
 /**
  * qman_p_poll_dqrr - process DQRR (fast-path) entries
  * @limit: the maximum number of DQRR entries to process
-- 
cgit v1.2.3-59-g8ed1b


From e06eea555b878f2c95b498aa1c485250ad30c960 Mon Sep 17 00:00:00 2001
From: Madalin Bucur <madalin.bucur@nxp.com>
Date: Thu, 31 Oct 2019 16:37:59 +0200
Subject: dpaa_eth: register a device link for the qman portal used

Before this change, unbinding the QMan portals did not trigger a
corresponding unbinding of the dpaa_eth making use of it; the first
QMan portal related operation issued afterwards crashed the kernel.
The device link ensures the dpaa_eth dependency upon the qman portal
used is honoured at the QMan portal removal.

Signed-off-by: Madalin Bucur <madalin.bucur@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 5 +++--
 drivers/soc/fsl/qbman/qman.c                   | 6 ------
 include/soc/fsl/qman.h                         | 7 -------
 3 files changed, 3 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index d8b41a0a7e3c..6a9d12dad5d9 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -750,7 +750,7 @@ static void dpaa_release_channel(void)
 	qman_release_pool(rx_pool_channel);
 }
 
-static void dpaa_eth_add_channel(u16 channel)
+static void dpaa_eth_add_channel(u16 channel, struct device *dev)
 {
 	u32 pool = QM_SDQCR_CHANNELS_POOL_CONV(channel);
 	const cpumask_t *cpus = qman_affine_cpus();
@@ -760,6 +760,7 @@ static void dpaa_eth_add_channel(u16 channel)
 	for_each_cpu_and(cpu, cpus, cpu_online_mask) {
 		portal = qman_get_affine_portal(cpu);
 		qman_p_static_dequeue_add(portal, pool);
+		qman_start_using_portal(portal, dev);
 	}
 }
 
@@ -2873,7 +2874,7 @@ static int dpaa_eth_probe(struct platform_device *pdev)
 	/* Walk the CPUs with affine portals
 	 * and add this pool channel to each's dequeue mask.
 	 */
-	dpaa_eth_add_channel(priv->channel);
+	dpaa_eth_add_channel(priv->channel, &pdev->dev);
 
 	dpaa_fq_setup(priv, &dpaa_fq_cbs, priv->mac_dev->port[TX]);
 
diff --git a/drivers/soc/fsl/qbman/qman.c b/drivers/soc/fsl/qbman/qman.c
index bc75a5882b9e..1e164e03410a 100644
--- a/drivers/soc/fsl/qbman/qman.c
+++ b/drivers/soc/fsl/qbman/qman.c
@@ -1756,12 +1756,6 @@ int qman_start_using_portal(struct qman_portal *p, struct device *dev)
 }
 EXPORT_SYMBOL(qman_start_using_portal);
 
-void qman_stop_using_portal(struct qman_portal *p, struct device *dev)
-{
-	device_link_remove(dev, p->config->dev);
-}
-EXPORT_SYMBOL(qman_stop_using_portal);
-
 int qman_p_poll_dqrr(struct qman_portal *p, unsigned int limit)
 {
 	return __poll_portal_fast(p, limit);
diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h
index c499c5cfa7c9..cfe00e08e85b 100644
--- a/include/soc/fsl/qman.h
+++ b/include/soc/fsl/qman.h
@@ -925,13 +925,6 @@ struct qman_portal *qman_get_affine_portal(int cpu);
  */
 int qman_start_using_portal(struct qman_portal *p, struct device *dev);
 
-/**
- * qman_stop_using_portal - deregister a device link for the portal user
- * @p: the portal that will no longer be in use
- * @dev: the device that uses the portal
- */
-void qman_stop_using_portal(struct qman_portal *p, struct device *dev);
-
 /**
  * qman_p_poll_dqrr - process DQRR (fast-path) entries
  * @limit: the maximum number of DQRR entries to process
-- 
cgit v1.2.3-59-g8ed1b


From 1ac210d128ef6e92698dd3aa4e2e03e831bc9906 Mon Sep 17 00:00:00 2001
From: Ioana Ciornei <ioana.ciornei@nxp.com>
Date: Thu, 31 Oct 2019 01:18:29 +0200
Subject: bus: fsl-mc: add the fsl_mc_get_endpoint function

Using the newly added fsl_mc_get_endpoint function a fsl-mc driver can
find its associated endpoint (another object at the other link of a MC
firmware link).

The API will be used in the following patch in order to discover the
connected DPMAC object of a DPNI.

Also, the fsl_mc_device_lookup function is made available to the entire
fsl-mc bus driver and not just for the dprc driver.

Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/bus/fsl-mc/dprc-driver.c    |  6 ++---
 drivers/bus/fsl-mc/dprc.c           | 53 +++++++++++++++++++++++++++++++++++++
 drivers/bus/fsl-mc/fsl-mc-bus.c     | 33 +++++++++++++++++++++++
 drivers/bus/fsl-mc/fsl-mc-private.h | 42 +++++++++++++++++++++++++++++
 include/linux/fsl/mc.h              |  2 ++
 5 files changed, 132 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/bus/fsl-mc/dprc-driver.c b/drivers/bus/fsl-mc/dprc-driver.c
index 52c7e15143d6..c8b1c3842c1a 100644
--- a/drivers/bus/fsl-mc/dprc-driver.c
+++ b/drivers/bus/fsl-mc/dprc-driver.c
@@ -104,10 +104,8 @@ static int __fsl_mc_device_match(struct device *dev, void *data)
 	return fsl_mc_device_match(mc_dev, obj_desc);
 }
 
-static struct fsl_mc_device *fsl_mc_device_lookup(struct fsl_mc_obj_desc
-								*obj_desc,
-						  struct fsl_mc_device
-								*mc_bus_dev)
+struct fsl_mc_device *fsl_mc_device_lookup(struct fsl_mc_obj_desc *obj_desc,
+					   struct fsl_mc_device *mc_bus_dev)
 {
 	struct device *dev;
 
diff --git a/drivers/bus/fsl-mc/dprc.c b/drivers/bus/fsl-mc/dprc.c
index 0fe3f52ae0de..602f030d84eb 100644
--- a/drivers/bus/fsl-mc/dprc.c
+++ b/drivers/bus/fsl-mc/dprc.c
@@ -554,3 +554,56 @@ int dprc_get_container_id(struct fsl_mc_io *mc_io,
 
 	return 0;
 }
+
+/**
+ * dprc_get_connection() - Get connected endpoint and link status if connection
+ *			exists.
+ * @mc_io:	Pointer to MC portal's I/O object
+ * @cmd_flags:	Command flags; one or more of 'MC_CMD_FLAG_'
+ * @token:	Token of DPRC object
+ * @endpoint1:	Endpoint 1 configuration parameters
+ * @endpoint2:	Returned endpoint 2 configuration parameters
+ * @state:	Returned link state:
+ *		1 - link is up;
+ *		0 - link is down;
+ *		-1 - no connection (endpoint2 information is irrelevant)
+ *
+ * Return:     '0' on Success; -ENOTCONN if connection does not exist.
+ */
+int dprc_get_connection(struct fsl_mc_io *mc_io,
+			u32 cmd_flags,
+			u16 token,
+			const struct dprc_endpoint *endpoint1,
+			struct dprc_endpoint *endpoint2,
+			int *state)
+{
+	struct dprc_cmd_get_connection *cmd_params;
+	struct dprc_rsp_get_connection *rsp_params;
+	struct fsl_mc_command cmd = { 0 };
+	int err, i;
+
+	/* prepare command */
+	cmd.header = mc_encode_cmd_header(DPRC_CMDID_GET_CONNECTION,
+					  cmd_flags,
+					  token);
+	cmd_params = (struct dprc_cmd_get_connection *)cmd.params;
+	cmd_params->ep1_id = cpu_to_le32(endpoint1->id);
+	cmd_params->ep1_interface_id = cpu_to_le16(endpoint1->if_id);
+	for (i = 0; i < 16; i++)
+		cmd_params->ep1_type[i] = endpoint1->type[i];
+
+	/* send command to mc */
+	err = mc_send_command(mc_io, &cmd);
+	if (err)
+		return -ENOTCONN;
+
+	/* retrieve response parameters */
+	rsp_params = (struct dprc_rsp_get_connection *)cmd.params;
+	endpoint2->id = le32_to_cpu(rsp_params->ep2_id);
+	endpoint2->if_id = le16_to_cpu(rsp_params->ep2_interface_id);
+	*state = le32_to_cpu(rsp_params->state);
+	for (i = 0; i < 16; i++)
+		endpoint2->type[i] = rsp_params->ep2_type[i];
+
+	return 0;
+}
diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c
index bb3c2fc7c5ba..a07cc19becdb 100644
--- a/drivers/bus/fsl-mc/fsl-mc-bus.c
+++ b/drivers/bus/fsl-mc/fsl-mc-bus.c
@@ -712,6 +712,39 @@ void fsl_mc_device_remove(struct fsl_mc_device *mc_dev)
 }
 EXPORT_SYMBOL_GPL(fsl_mc_device_remove);
 
+struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev)
+{
+	struct fsl_mc_device *mc_bus_dev, *endpoint;
+	struct fsl_mc_obj_desc endpoint_desc = { 0 };
+	struct dprc_endpoint endpoint1 = { 0 };
+	struct dprc_endpoint endpoint2 = { 0 };
+	int state, err;
+
+	mc_bus_dev = to_fsl_mc_device(mc_dev->dev.parent);
+	strcpy(endpoint1.type, mc_dev->obj_desc.type);
+	endpoint1.id = mc_dev->obj_desc.id;
+
+	err = dprc_get_connection(mc_bus_dev->mc_io, 0,
+				  mc_bus_dev->mc_handle,
+				  &endpoint1, &endpoint2,
+				  &state);
+
+	if (err == -ENOTCONN || state == -1)
+		return ERR_PTR(-ENOTCONN);
+
+	if (err < 0) {
+		dev_err(&mc_bus_dev->dev, "dprc_get_connection() = %d\n", err);
+		return ERR_PTR(err);
+	}
+
+	strcpy(endpoint_desc.type, endpoint2.type);
+	endpoint_desc.id = endpoint2.id;
+	endpoint = fsl_mc_device_lookup(&endpoint_desc, mc_bus_dev);
+
+	return endpoint;
+}
+EXPORT_SYMBOL_GPL(fsl_mc_get_endpoint);
+
 static int parse_mc_ranges(struct device *dev,
 			   int *paddr_cells,
 			   int *mc_addr_cells,
diff --git a/drivers/bus/fsl-mc/fsl-mc-private.h b/drivers/bus/fsl-mc/fsl-mc-private.h
index 020fcc04ec8b..21ca8c756ee7 100644
--- a/drivers/bus/fsl-mc/fsl-mc-private.h
+++ b/drivers/bus/fsl-mc/fsl-mc-private.h
@@ -105,6 +105,8 @@ int dpmcp_reset(struct fsl_mc_io *mc_io,
 #define DPRC_CMDID_GET_OBJ_REG_V2               DPRC_CMD_V2(0x15E)
 #define DPRC_CMDID_SET_OBJ_IRQ                  DPRC_CMD(0x15F)
 
+#define DPRC_CMDID_GET_CONNECTION               DPRC_CMD(0x16C)
+
 struct dprc_cmd_open {
 	__le32 container_id;
 };
@@ -228,6 +230,22 @@ struct dprc_cmd_set_obj_irq {
 	u8 obj_type[16];
 };
 
+struct dprc_cmd_get_connection {
+	__le32 ep1_id;
+	__le16 ep1_interface_id;
+	u8 pad[2];
+	u8 ep1_type[16];
+};
+
+struct dprc_rsp_get_connection {
+	__le64 pad[3];
+	__le32 ep2_id;
+	__le16 ep2_interface_id;
+	__le16 pad1;
+	u8 ep2_type[16];
+	__le32 state;
+};
+
 /*
  * DPRC API for managing and querying DPAA resources
  */
@@ -392,6 +410,27 @@ int dprc_get_container_id(struct fsl_mc_io *mc_io,
 			  u32 cmd_flags,
 			  int *container_id);
 
+/**
+ * struct dprc_endpoint - Endpoint description for link connect/disconnect
+ *			operations
+ * @type:	Endpoint object type: NULL terminated string
+ * @id:		Endpoint object ID
+ * @if_id:	Interface ID; should be set for endpoints with multiple
+ *		interfaces ("dpsw", "dpdmux"); for others, always set to 0
+ */
+struct dprc_endpoint {
+	char type[16];
+	int id;
+	u16 if_id;
+};
+
+int dprc_get_connection(struct fsl_mc_io *mc_io,
+			u32 cmd_flags,
+			u16 token,
+			const struct dprc_endpoint *endpoint1,
+			struct dprc_endpoint *endpoint2,
+			int *state);
+
 /*
  * Data Path Buffer Pool (DPBP) API
  */
@@ -574,4 +613,7 @@ void fsl_destroy_mc_io(struct fsl_mc_io *mc_io);
 
 bool fsl_mc_is_root_dprc(struct device *dev);
 
+struct fsl_mc_device *fsl_mc_device_lookup(struct fsl_mc_obj_desc *obj_desc,
+					   struct fsl_mc_device *mc_bus_dev);
+
 #endif /* _FSL_MC_PRIVATE_H_ */
diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h
index 975553a9f75d..54d9436600c7 100644
--- a/include/linux/fsl/mc.h
+++ b/include/linux/fsl/mc.h
@@ -403,6 +403,8 @@ int __must_check fsl_mc_allocate_irqs(struct fsl_mc_device *mc_dev);
 
 void fsl_mc_free_irqs(struct fsl_mc_device *mc_dev);
 
+struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev);
+
 extern struct bus_type fsl_mc_bus_type;
 
 extern struct device_type fsl_mc_bus_dprc_type;
-- 
cgit v1.2.3-59-g8ed1b


From c5f51765a1f60b701840544faf3ca63204b8dc3c Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Wed, 30 Oct 2019 22:09:13 -0400
Subject: net: dsa: list DSA links in the fabric

Implement a new list of DSA links in the switch fabric itself, to
provide an alterative to the ds->rtable static arrays.

At the same time, provide a new dsa_routing_port() helper to abstract
the usage of ds->rtable in drivers. If there's no port to reach a
given device, return the first invalid port, ds->num_ports. This avoids
potential signedness errors or the need to define special values.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/chip.c |  8 ++++----
 include/net/dsa.h                | 29 +++++++++++++++++++++++++++-
 net/dsa/dsa2.c                   | 41 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 72 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 619cd081339e..66de492117ad 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -1143,6 +1143,7 @@ static int mv88e6xxx_pri_setup(struct mv88e6xxx_chip *chip)
 
 static int mv88e6xxx_devmap_setup(struct mv88e6xxx_chip *chip)
 {
+	struct dsa_switch *ds = chip->ds;
 	int target, port;
 	int err;
 
@@ -1151,10 +1152,9 @@ static int mv88e6xxx_devmap_setup(struct mv88e6xxx_chip *chip)
 
 	/* Initialize the routing port to the 32 possible target devices */
 	for (target = 0; target < 32; target++) {
-		port = 0x1f;
-		if (target < DSA_MAX_SWITCHES)
-			if (chip->ds->rtable[target] != DSA_RTABLE_NONE)
-				port = chip->ds->rtable[target];
+		port = dsa_routing_port(ds, target);
+		if (port == ds->num_ports)
+			port = 0x1f;
 
 		err = mv88e6xxx_g2_device_mapping_write(chip, target, port);
 		if (err)
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 9aba326abb64..3d7366d634d8 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -123,6 +123,9 @@ struct dsa_switch_tree {
 	/* List of switch ports */
 	struct list_head ports;
 
+	/* List of DSA links composing the routing table */
+	struct list_head rtable;
+
 	/*
 	 * Data for the individual switch chips.
 	 */
@@ -214,6 +217,17 @@ struct dsa_port {
 	bool setup;
 };
 
+/* TODO: ideally DSA ports would have a single dp->link_dp member,
+ * and no dst->rtable nor this struct dsa_link would be needed,
+ * but this would require some more complex tree walking,
+ * so keep it stupid at the moment and list them all.
+ */
+struct dsa_link {
+	struct dsa_port *dp;
+	struct dsa_port *link_dp;
+	struct list_head list;
+};
+
 struct dsa_switch {
 	bool setup;
 
@@ -324,6 +338,19 @@ static inline u32 dsa_user_ports(struct dsa_switch *ds)
 	return mask;
 }
 
+/* Return the local port used to reach an arbitrary switch device */
+static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device)
+{
+	struct dsa_switch_tree *dst = ds->dst;
+	struct dsa_link *dl;
+
+	list_for_each_entry(dl, &dst->rtable, list)
+		if (dl->dp->ds == ds && dl->link_dp->ds->index == device)
+			return dl->dp->index;
+
+	return ds->num_ports;
+}
+
 /* Return the local port used to reach an arbitrary switch port */
 static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device,
 					    int port)
@@ -331,7 +358,7 @@ static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device,
 	if (device == ds->index)
 		return port;
 	else
-		return ds->rtable[device];
+		return dsa_routing_port(ds, device);
 }
 
 /* Return the local port used to reach the dedicated CPU port */
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index e7aae96b54bb..222d7dbfcfea 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -45,6 +45,8 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index)
 
 	dst->index = index;
 
+	INIT_LIST_HEAD(&dst->rtable);
+
 	INIT_LIST_HEAD(&dst->ports);
 
 	INIT_LIST_HEAD(&dst->list);
@@ -122,6 +124,31 @@ static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst,
 	return NULL;
 }
 
+struct dsa_link *dsa_link_touch(struct dsa_port *dp, struct dsa_port *link_dp)
+{
+	struct dsa_switch *ds = dp->ds;
+	struct dsa_switch_tree *dst;
+	struct dsa_link *dl;
+
+	dst = ds->dst;
+
+	list_for_each_entry(dl, &dst->rtable, list)
+		if (dl->dp == dp && dl->link_dp == link_dp)
+			return dl;
+
+	dl = kzalloc(sizeof(*dl), GFP_KERNEL);
+	if (!dl)
+		return NULL;
+
+	dl->dp = dp;
+	dl->link_dp = link_dp;
+
+	INIT_LIST_HEAD(&dl->list);
+	list_add_tail(&dl->list, &dst->rtable);
+
+	return dl;
+}
+
 static bool dsa_port_setup_routing_table(struct dsa_port *dp)
 {
 	struct dsa_switch *ds = dp->ds;
@@ -129,6 +156,7 @@ static bool dsa_port_setup_routing_table(struct dsa_port *dp)
 	struct device_node *dn = dp->dn;
 	struct of_phandle_iterator it;
 	struct dsa_port *link_dp;
+	struct dsa_link *dl;
 	int err;
 
 	of_for_each_phandle(&it, err, dn, "link", NULL, 0) {
@@ -138,7 +166,11 @@ static bool dsa_port_setup_routing_table(struct dsa_port *dp)
 			return false;
 		}
 
-		ds->rtable[link_dp->ds->index] = dp->index;
+		dl = dsa_link_touch(dp, link_dp);
+		if (!dl) {
+			of_node_put(it.node);
+			return false;
+		}
 	}
 
 	return true;
@@ -544,6 +576,8 @@ teardown_default_cpu:
 
 static void dsa_tree_teardown(struct dsa_switch_tree *dst)
 {
+	struct dsa_link *dl, *next;
+
 	if (!dst->setup)
 		return;
 
@@ -553,6 +587,11 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst)
 
 	dsa_tree_teardown_default_cpu(dst);
 
+	list_for_each_entry_safe(dl, next, &dst->rtable, list) {
+		list_del(&dl->list);
+		kfree(dl);
+	}
+
 	pr_info("DSA: tree %d torn down\n", dst->index);
 
 	dst->setup = false;
-- 
cgit v1.2.3-59-g8ed1b


From 96252b8e05326df072cd321159878aa4725c5bd4 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Wed, 30 Oct 2019 22:09:14 -0400
Subject: net: dsa: remove ds->rtable

Drivers do not use the ds->rtable static arrays anymore, get rid of it.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 7 -------
 net/dsa/dsa2.c    | 4 ----
 2 files changed, 11 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 3d7366d634d8..b46222adb5c2 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -258,13 +258,6 @@ struct dsa_switch {
 	 */
 	const struct dsa_switch_ops	*ops;
 
-	/*
-	 * An array of which element [a] indicates which port on this
-	 * switch should be used to send packets to that are destined
-	 * for switch a. Can be NULL if there is only one switch chip.
-	 */
-	s8		rtable[DSA_MAX_SWITCHES];
-
 	/*
 	 * Slave mii_bus and devices for the individual ports.
 	 */
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 222d7dbfcfea..efd7453f308e 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -181,10 +181,6 @@ static bool dsa_switch_setup_routing_table(struct dsa_switch *ds)
 	struct dsa_switch_tree *dst = ds->dst;
 	bool complete = true;
 	struct dsa_port *dp;
-	int i;
-
-	for (i = 0; i < DSA_MAX_SWITCHES; i++)
-		ds->rtable[i] = DSA_RTABLE_NONE;
 
 	list_for_each_entry(dp, &dst->ports, list) {
 		if (dp->ds == ds && dsa_port_is_dsa(dp)) {
-- 
cgit v1.2.3-59-g8ed1b


From 9c8ad1ab66b577526a4c89e4a222e0fac431a2d6 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@gmail.com>
Date: Wed, 30 Oct 2019 22:09:16 -0400
Subject: net: dsa: remove the dst->ds array

Now that the DSA ports are listed in the switch fabric, there is
no need to store the dsa_switch structures from the drivers in the
fabric anymore. So get rid of the dst->ds static array.

Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 5 -----
 net/dsa/dsa2.c    | 7 -------
 2 files changed, 12 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index b46222adb5c2..e4c697b95c70 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -125,11 +125,6 @@ struct dsa_switch_tree {
 
 	/* List of DSA links composing the routing table */
 	struct list_head rtable;
-
-	/*
-	 * Data for the individual switch chips.
-	 */
-	struct dsa_switch	*ds[DSA_MAX_SWITCHES];
 };
 
 /* TC matchall action types, only mirroring for now */
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index a887231fff13..92e71b12b729 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -578,25 +578,18 @@ static void dsa_tree_remove_switch(struct dsa_switch_tree *dst,
 {
 	dsa_tree_teardown(dst);
 
-	dst->ds[index] = NULL;
 	dsa_tree_put(dst);
 }
 
 static int dsa_tree_add_switch(struct dsa_switch_tree *dst,
 			       struct dsa_switch *ds)
 {
-	unsigned int index = ds->index;
 	int err;
 
-	if (dst->ds[index])
-		return -EBUSY;
-
 	dsa_tree_get(dst);
-	dst->ds[index] = ds;
 
 	err = dsa_tree_setup(dst);
 	if (err) {
-		dst->ds[index] = NULL;
 		dsa_tree_put(dst);
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From be0c5677970d4f21dc701136a178437aad9983b2 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 1 Nov 2019 14:46:37 +0200
Subject: net: bridge: fdb: br_fdb_update can take flags directly

If we modify br_fdb_update() to take flags directly we can get rid of
one test and one atomic bitop in the learning path.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/bridge.h | 12 ++++++------
 net/bridge/br_fdb.c           | 15 ++++++---------
 net/bridge/br_input.c         |  4 ++--
 net/bridge/br_private.h       |  2 +-
 4 files changed, 15 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/bridge.h b/include/trace/events/bridge.h
index 8ea966448b58..6b200059c2c5 100644
--- a/include/trace/events/bridge.h
+++ b/include/trace/events/bridge.h
@@ -95,16 +95,16 @@ TRACE_EVENT(fdb_delete,
 TRACE_EVENT(br_fdb_update,
 
 	TP_PROTO(struct net_bridge *br, struct net_bridge_port *source,
-		 const unsigned char *addr, u16 vid, bool added_by_user),
+		 const unsigned char *addr, u16 vid, unsigned long flags),
 
-	TP_ARGS(br, source, addr, vid, added_by_user),
+	TP_ARGS(br, source, addr, vid, flags),
 
 	TP_STRUCT__entry(
 		__string(br_dev, br->dev->name)
 		__string(dev, source->dev->name)
 		__array(unsigned char, addr, ETH_ALEN)
 		__field(u16, vid)
-		__field(bool, added_by_user)
+		__field(unsigned long, flags)
 	),
 
 	TP_fast_assign(
@@ -112,14 +112,14 @@ TRACE_EVENT(br_fdb_update,
 		__assign_str(dev, source->dev->name);
 		memcpy(__entry->addr, addr, ETH_ALEN);
 		__entry->vid = vid;
-		__entry->added_by_user = added_by_user;
+		__entry->flags = flags;
 	),
 
-	TP_printk("br_dev %s source %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u added_by_user %d",
+	TP_printk("br_dev %s source %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u flags 0x%lx",
 		  __get_str(br_dev), __get_str(dev), __entry->addr[0],
 		  __entry->addr[1], __entry->addr[2], __entry->addr[3],
 		  __entry->addr[4], __entry->addr[5], __entry->vid,
-		  __entry->added_by_user)
+		  __entry->flags)
 );
 
 
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index f244f2ac7156..b37e0f4c1b2b 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -557,7 +557,7 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 }
 
 void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
-		   const unsigned char *addr, u16 vid, bool added_by_user)
+		   const unsigned char *addr, u16 vid, unsigned long flags)
 {
 	struct net_bridge_fdb_entry *fdb;
 	bool fdb_modified = false;
@@ -592,21 +592,18 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 			}
 			if (now != fdb->updated)
 				fdb->updated = now;
-			if (unlikely(added_by_user))
+			if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags)))
 				set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
 			if (unlikely(fdb_modified)) {
-				trace_br_fdb_update(br, source, addr, vid, added_by_user);
+				trace_br_fdb_update(br, source, addr, vid, flags);
 				fdb_notify(br, fdb, RTM_NEWNEIGH, true);
 			}
 		}
 	} else {
 		spin_lock(&br->hash_lock);
-		fdb = fdb_create(br, source, addr, vid, 0);
+		fdb = fdb_create(br, source, addr, vid, flags);
 		if (fdb) {
-			if (unlikely(added_by_user))
-				set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
-			trace_br_fdb_update(br, source, addr, vid,
-					    added_by_user);
+			trace_br_fdb_update(br, source, addr, vid, flags);
 			fdb_notify(br, fdb, RTM_NEWNEIGH, true);
 		}
 		/* else  we lose race and someone else inserts
@@ -889,7 +886,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
 		}
 		local_bh_disable();
 		rcu_read_lock();
-		br_fdb_update(br, p, addr, vid, true);
+		br_fdb_update(br, p, addr, vid, BIT(BR_FDB_ADDED_BY_USER));
 		rcu_read_unlock();
 		local_bh_enable();
 	} else if (ndm->ndm_flags & NTF_EXT_LEARNED) {
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 7f5f646dba6e..f37b05090f45 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -88,7 +88,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 	/* insert into forwarding database after filtering to avoid spoofing */
 	br = p->br;
 	if (p->flags & BR_LEARNING)
-		br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
+		br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0);
 
 	local_rcv = !!(br->dev->flags & IFF_PROMISC);
 	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) {
@@ -184,7 +184,7 @@ static void __br_handle_local_finish(struct sk_buff *skb)
 	if ((p->flags & BR_LEARNING) &&
 	    !br_opt_get(p->br, BROPT_NO_LL_LEARN) &&
 	    br_should_learn(p, skb, &vid))
-		br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false);
+		br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, 0);
 }
 
 /* note: already called with rcu_read_lock */
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index f4754bf7f4bd..08742bff9bf0 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -571,7 +571,7 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count,
 int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
 		  const unsigned char *addr, u16 vid);
 void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
-		   const unsigned char *addr, u16 vid, bool added_by_user);
+		   const unsigned char *addr, u16 vid, unsigned long flags);
 
 int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
 		  struct net_device *dev, const unsigned char *addr, u16 vid);
-- 
cgit v1.2.3-59-g8ed1b


From e53a9d26cf80565cfb7172fc52a0dfac73613a0f Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Mon, 28 Oct 2019 23:35:30 +0000
Subject: IB/mlx5: Introduce and use mlx5_core_is_vf()

Instead of deciding a given device is virtual function or
not based on a device is PF or not, use already defined
MLX5_COREDEV_VF by introducing an helper API mlx5_core_is_vf().

This enables to clearly identify PF, VF and non virtual functions.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Vu Pham <vuhuong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c | 2 +-
 include/linux/mlx5/driver.h       | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 831539419c30..8343a740c91e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1031,7 +1031,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 	if (MLX5_CAP_GEN(mdev, cd))
 		props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
 
-	if (!mlx5_core_is_pf(mdev))
+	if (mlx5_core_is_vf(mdev))
 		props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
 
 	if (mlx5_ib_port_link_layer(ibdev, 1) ==
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3e80f03a387f..7b4801e96feb 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1121,6 +1121,11 @@ static inline bool mlx5_core_is_pf(const struct mlx5_core_dev *dev)
 	return dev->coredev_type == MLX5_COREDEV_PF;
 }
 
+static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev)
+{
+	return dev->coredev_type == MLX5_COREDEV_VF;
+}
+
 static inline bool mlx5_core_is_ecpf(struct mlx5_core_dev *dev)
 {
 	return dev->caps.embedded_cpu;
-- 
cgit v1.2.3-59-g8ed1b


From d817991cc7486ab83f6c7188b0bc80eebee872f6 Mon Sep 17 00:00:00 2001
From: Björn Töpel <bjorn.topel@intel.com>
Date: Fri, 1 Nov 2019 12:03:46 +0100
Subject: xsk: Restructure/inline XSKMAP lookup/redirect/flush
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In this commit the XSKMAP entry lookup function used by the XDP
redirect code is moved from the xskmap.c file to the xdp_sock.h
header, so the lookup can be inlined from, e.g., the
bpf_xdp_redirect_map() function.

Further the __xsk_map_redirect() and __xsk_map_flush() is moved to the
xsk.c, which lets the compiler inline the xsk_rcv() and xsk_flush()
functions.

Finally, all the XDP socket functions were moved from linux/bpf.h to
net/xdp_sock.h, where most of the XDP sockets functions are anyway.

This yields a ~2% performance boost for the xdpsock "rx_drop"
scenario.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20191101110346.15004-4-bjorn.topel@gmail.com
---
 include/linux/bpf.h    | 25 -------------------------
 include/net/xdp_sock.h | 51 ++++++++++++++++++++++++++++++++++++++------------
 kernel/bpf/xskmap.c    | 48 -----------------------------------------------
 net/xdp/xsk.c          | 33 ++++++++++++++++++++++++++++++--
 4 files changed, 70 insertions(+), 87 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 80158cff44bd..7c7f518811a6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1009,31 +1009,6 @@ static inline int sock_map_get_from_fd(const union bpf_attr *attr,
 }
 #endif
 
-#if defined(CONFIG_XDP_SOCKETS)
-struct xdp_sock;
-struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key);
-int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
-		       struct xdp_sock *xs);
-void __xsk_map_flush(struct bpf_map *map);
-#else
-struct xdp_sock;
-static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
-						     u32 key)
-{
-	return NULL;
-}
-
-static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
-				     struct xdp_sock *xs)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void __xsk_map_flush(struct bpf_map *map)
-{
-}
-#endif
-
 #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
 void bpf_sk_reuseport_detach(struct sock *sk);
 int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index c9398ce7960f..e3780e4b74e1 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -69,7 +69,14 @@ struct xdp_umem {
 /* Nodes are linked in the struct xdp_sock map_list field, and used to
  * track which maps a certain socket reside in.
  */
-struct xsk_map;
+
+struct xsk_map {
+	struct bpf_map map;
+	struct list_head __percpu *flush_list;
+	spinlock_t lock; /* Synchronize map updates */
+	struct xdp_sock *xsk_map[];
+};
+
 struct xsk_map_node {
 	struct list_head node;
 	struct xsk_map *map;
@@ -109,8 +116,6 @@ struct xdp_sock {
 struct xdp_buff;
 #ifdef CONFIG_XDP_SOCKETS
 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
-int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
-void xsk_flush(struct xdp_sock *xs);
 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
 /* Used from netdev driver */
 bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt);
@@ -134,6 +139,22 @@ void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
 			     struct xdp_sock **map_entry);
 int xsk_map_inc(struct xsk_map *map);
 void xsk_map_put(struct xsk_map *map);
+int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
+		       struct xdp_sock *xs);
+void __xsk_map_flush(struct bpf_map *map);
+
+static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
+						     u32 key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xdp_sock *xs;
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	xs = READ_ONCE(m->xsk_map[key]);
+	return xs;
+}
 
 static inline u64 xsk_umem_extract_addr(u64 addr)
 {
@@ -224,15 +245,6 @@ static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	return -ENOTSUPP;
 }
 
-static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
-{
-	return -ENOTSUPP;
-}
-
-static inline void xsk_flush(struct xdp_sock *xs)
-{
-}
-
 static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
 {
 	return false;
@@ -357,6 +369,21 @@ static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle,
 	return 0;
 }
 
+static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
+				     struct xdp_sock *xs)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void __xsk_map_flush(struct bpf_map *map)
+{
+}
+
+static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
+						     u32 key)
+{
+	return NULL;
+}
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 554939f78b83..da16c30868f3 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -9,13 +9,6 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 
-struct xsk_map {
-	struct bpf_map map;
-	struct list_head __percpu *flush_list;
-	spinlock_t lock; /* Synchronize map updates */
-	struct xdp_sock *xsk_map[];
-};
-
 int xsk_map_inc(struct xsk_map *map)
 {
 	struct bpf_map *m = &map->map;
@@ -151,18 +144,6 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
-struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
-{
-	struct xsk_map *m = container_of(map, struct xsk_map, map);
-	struct xdp_sock *xs;
-
-	if (key >= map->max_entries)
-		return NULL;
-
-	xs = READ_ONCE(m->xsk_map[key]);
-	return xs;
-}
-
 static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 {
 	const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
@@ -179,35 +160,6 @@ static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 	return insn - insn_buf;
 }
 
-int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
-		       struct xdp_sock *xs)
-{
-	struct xsk_map *m = container_of(map, struct xsk_map, map);
-	struct list_head *flush_list = this_cpu_ptr(m->flush_list);
-	int err;
-
-	err = xsk_rcv(xs, xdp);
-	if (err)
-		return err;
-
-	if (!xs->flush_node.prev)
-		list_add(&xs->flush_node, flush_list);
-
-	return 0;
-}
-
-void __xsk_map_flush(struct bpf_map *map)
-{
-	struct xsk_map *m = container_of(map, struct xsk_map, map);
-	struct list_head *flush_list = this_cpu_ptr(m->flush_list);
-	struct xdp_sock *xs, *tmp;
-
-	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
-		xsk_flush(xs);
-		__list_del_clearprev(&xs->flush_node);
-	}
-}
-
 static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 9044073fbf22..6040bc2b0088 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -196,7 +196,7 @@ static bool xsk_is_bound(struct xdp_sock *xs)
 	return false;
 }
 
-int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
+static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
 	u32 len;
 
@@ -212,7 +212,7 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
 }
 
-void xsk_flush(struct xdp_sock *xs)
+static void xsk_flush(struct xdp_sock *xs)
 {
 	xskq_produce_flush_desc(xs->rx);
 	xs->sk.sk_data_ready(&xs->sk);
@@ -264,6 +264,35 @@ out_unlock:
 	return err;
 }
 
+int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
+		       struct xdp_sock *xs)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+	int err;
+
+	err = xsk_rcv(xs, xdp);
+	if (err)
+		return err;
+
+	if (!xs->flush_node.prev)
+		list_add(&xs->flush_node, flush_list);
+
+	return 0;
+}
+
+void __xsk_map_flush(struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+	struct xdp_sock *xs, *tmp;
+
+	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
+		xsk_flush(xs);
+		__list_del_clearprev(&xs->flush_node);
+	}
+}
+
 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
 {
 	xskq_produce_flush_addr_n(umem->cq, nb_entries);
-- 
cgit v1.2.3-59-g8ed1b


From 1d1585ca0f48fe7ed95c3571f3e4a82b2b5045dc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Nov 2019 00:17:56 +0100
Subject: uaccess: Add non-pagefault user-space write function

Commit 3d7081822f7f ("uaccess: Add non-pagefault user-space read functions")
missed to add probe write function, therefore factor out a probe_write_common()
helper with most logic of probe_kernel_write() except setting KERNEL_DS, and
add a new probe_user_write() helper so it can be used from BPF side.

Again, on some archs, the user address space and kernel address space can
co-exist and be overlapping, so in such case, setting KERNEL_DS would mean
that the given address is treated as being in kernel address space.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Link: https://lore.kernel.org/bpf/9df2542e68141bfa3addde631441ee45503856a8.1572649915.git.daniel@iogearbox.net
---
 include/linux/uaccess.h | 12 ++++++++++++
 mm/maccess.c            | 45 +++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 53 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index d4ee6e942562..38555435a64a 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -337,6 +337,18 @@ extern long __probe_user_read(void *dst, const void __user *src, size_t size);
 extern long notrace probe_kernel_write(void *dst, const void *src, size_t size);
 extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size);
 
+/*
+ * probe_user_write(): safely attempt to write to a location in user space
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src.  If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+extern long notrace probe_user_write(void __user *dst, const void *src, size_t size);
+extern long notrace __probe_user_write(void __user *dst, const void *src, size_t size);
+
 extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
 extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
 				     long count);
diff --git a/mm/maccess.c b/mm/maccess.c
index d065736f6b87..2d3c3d01064c 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -18,6 +18,18 @@ probe_read_common(void *dst, const void __user *src, size_t size)
 	return ret ? -EFAULT : 0;
 }
 
+static __always_inline long
+probe_write_common(void __user *dst, const void *src, size_t size)
+{
+	long ret;
+
+	pagefault_disable();
+	ret = __copy_to_user_inatomic(dst, src, size);
+	pagefault_enable();
+
+	return ret ? -EFAULT : 0;
+}
+
 /**
  * probe_kernel_read(): safely attempt to read from a kernel-space location
  * @dst: pointer to the buffer that shall take the data
@@ -85,6 +97,7 @@ EXPORT_SYMBOL_GPL(probe_user_read);
  * Safely write to address @dst from the buffer at @src.  If a kernel fault
  * happens, handle that and return -EFAULT.
  */
+
 long __weak probe_kernel_write(void *dst, const void *src, size_t size)
     __attribute__((alias("__probe_kernel_write")));
 
@@ -94,15 +107,39 @@ long __probe_kernel_write(void *dst, const void *src, size_t size)
 	mm_segment_t old_fs = get_fs();
 
 	set_fs(KERNEL_DS);
-	pagefault_disable();
-	ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
-	pagefault_enable();
+	ret = probe_write_common((__force void __user *)dst, src, size);
 	set_fs(old_fs);
 
-	return ret ? -EFAULT : 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(probe_kernel_write);
 
+/**
+ * probe_user_write(): safely attempt to write to a user-space location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src.  If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+
+long __weak probe_user_write(void __user *dst, const void *src, size_t size)
+    __attribute__((alias("__probe_user_write")));
+
+long __probe_user_write(void __user *dst, const void *src, size_t size)
+{
+	long ret = -EFAULT;
+	mm_segment_t old_fs = get_fs();
+
+	set_fs(USER_DS);
+	if (access_ok(dst, size))
+		ret = probe_write_common(dst, src, size);
+	set_fs(old_fs);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(probe_user_write);
 
 /**
  * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
-- 
cgit v1.2.3-59-g8ed1b


From 75a1a607bb7e6d918be3aca11ec2214a275392f4 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Nov 2019 00:17:57 +0100
Subject: uaccess: Add strict non-pagefault kernel-space read function

Add two new probe_kernel_read_strict() and strncpy_from_unsafe_strict()
helpers which by default alias to the __probe_kernel_read() and the
__strncpy_from_unsafe(), respectively, but can be overridden by archs
which have non-overlapping address ranges for kernel space and user
space in order to bail out with -EFAULT when attempting to probe user
memory including non-canonical user access addresses [0]:

  4-level page tables:
    user-space mem: 0x0000000000000000 - 0x00007fffffffffff
    non-canonical:  0x0000800000000000 - 0xffff7fffffffffff

  5-level page tables:
    user-space mem: 0x0000000000000000 - 0x00ffffffffffffff
    non-canonical:  0x0100000000000000 - 0xfeffffffffffffff

The idea is that these helpers are complementary to the probe_user_read()
and strncpy_from_unsafe_user() which probe user-only memory. Both added
helpers here do the same, but for kernel-only addresses.

Both set of helpers are going to be used for BPF tracing. They also
explicitly avoid throwing the splat for non-canonical user addresses from
00c42373d397 ("x86-64: add warning for non-canonical user access address
dereferences").

For compat, the current probe_kernel_read() and strncpy_from_unsafe() are
left as-is.

  [0] Documentation/x86/x86_64/mm.txt

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: x86@kernel.org
Link: https://lore.kernel.org/bpf/eefeefd769aa5a013531f491a71f0936779e916b.1572649915.git.daniel@iogearbox.net
---
 arch/x86/mm/Makefile    |  2 +-
 arch/x86/mm/maccess.c   | 43 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/uaccess.h |  4 ++++
 mm/maccess.c            | 25 ++++++++++++++++++++++++-
 4 files changed, 72 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/mm/maccess.c

(limited to 'include')

diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 84373dc9b341..bbc68a54795e 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -13,7 +13,7 @@ CFLAGS_REMOVE_mem_encrypt_identity.o	= -pg
 endif
 
 obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
+	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o
 
 # Make sure __phys_addr has no stackprotector
 nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c
new file mode 100644
index 000000000000..f5b85bdc0535
--- /dev/null
+++ b/arch/x86/mm/maccess.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+
+#ifdef CONFIG_X86_64
+static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits)
+{
+	return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits);
+}
+
+static __always_inline bool invalid_probe_range(u64 vaddr)
+{
+	/*
+	 * Range covering the highest possible canonical userspace address
+	 * as well as non-canonical address range. For the canonical range
+	 * we also need to include the userspace guard page.
+	 */
+	return vaddr < TASK_SIZE_MAX + PAGE_SIZE ||
+	       canonical_address(vaddr, boot_cpu_data.x86_virt_bits) != vaddr;
+}
+#else
+static __always_inline bool invalid_probe_range(u64 vaddr)
+{
+	return vaddr < TASK_SIZE_MAX;
+}
+#endif
+
+long probe_kernel_read_strict(void *dst, const void *src, size_t size)
+{
+	if (unlikely(invalid_probe_range((unsigned long)src)))
+		return -EFAULT;
+
+	return __probe_kernel_read(dst, src, size);
+}
+
+long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, long count)
+{
+	if (unlikely(invalid_probe_range((unsigned long)unsafe_addr)))
+		return -EFAULT;
+
+	return __strncpy_from_unsafe(dst, unsafe_addr, count);
+}
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 38555435a64a..67f016010aad 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -311,6 +311,7 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
  * happens, handle that and return -EFAULT.
  */
 extern long probe_kernel_read(void *dst, const void *src, size_t size);
+extern long probe_kernel_read_strict(void *dst, const void *src, size_t size);
 extern long __probe_kernel_read(void *dst, const void *src, size_t size);
 
 /*
@@ -350,6 +351,9 @@ extern long notrace probe_user_write(void __user *dst, const void *src, size_t s
 extern long notrace __probe_user_write(void __user *dst, const void *src, size_t size);
 
 extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
+extern long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr,
+				       long count);
+extern long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
 extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
 				     long count);
 extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count);
diff --git a/mm/maccess.c b/mm/maccess.c
index 2d3c3d01064c..3ca8d97e5010 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -43,11 +43,20 @@ probe_write_common(void __user *dst, const void *src, size_t size)
  * do_page_fault() doesn't attempt to take mmap_sem.  This makes
  * probe_kernel_read() suitable for use within regions where the caller
  * already holds mmap_sem, or other locks which nest inside mmap_sem.
+ *
+ * probe_kernel_read_strict() is the same as probe_kernel_read() except for
+ * the case where architectures have non-overlapping user and kernel address
+ * ranges: probe_kernel_read_strict() will additionally return -EFAULT for
+ * probing memory on a user address range where probe_user_read() is supposed
+ * to be used instead.
  */
 
 long __weak probe_kernel_read(void *dst, const void *src, size_t size)
     __attribute__((alias("__probe_kernel_read")));
 
+long __weak probe_kernel_read_strict(void *dst, const void *src, size_t size)
+    __attribute__((alias("__probe_kernel_read")));
+
 long __probe_kernel_read(void *dst, const void *src, size_t size)
 {
 	long ret;
@@ -157,8 +166,22 @@ EXPORT_SYMBOL_GPL(probe_user_write);
  *
  * If @count is smaller than the length of the string, copies @count-1 bytes,
  * sets the last byte of @dst buffer to NUL and returns @count.
+ *
+ * strncpy_from_unsafe_strict() is the same as strncpy_from_unsafe() except
+ * for the case where architectures have non-overlapping user and kernel address
+ * ranges: strncpy_from_unsafe_strict() will additionally return -EFAULT for
+ * probing memory on a user address range where strncpy_from_unsafe_user() is
+ * supposed to be used instead.
  */
-long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
+
+long __weak strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
+    __attribute__((alias("__strncpy_from_unsafe")));
+
+long __weak strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr,
+				       long count)
+    __attribute__((alias("__strncpy_from_unsafe")));
+
+long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
 {
 	mm_segment_t old_fs = get_fs();
 	const void *src = unsafe_addr;
-- 
cgit v1.2.3-59-g8ed1b


From 6ae08ae3dea2cfa03dd3665a3c8475c2d429ef47 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 2 Nov 2019 00:17:59 +0100
Subject: bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str
 helpers

The current bpf_probe_read() and bpf_probe_read_str() helpers are broken
in that they assume they can be used for probing memory access for kernel
space addresses /as well as/ user space addresses.

However, plain use of probe_kernel_read() for both cases will attempt to
always access kernel space address space given access is performed under
KERNEL_DS and some archs in-fact have overlapping address spaces where a
kernel pointer and user pointer would have the /same/ address value and
therefore accessing application memory via bpf_probe_read{,_str}() would
read garbage values.

Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess:
Add non-pagefault user-space read functions"). Unfortunately, the only way
to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}()
and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}()
helpers are kept as-is to retain their current behavior.

The two *_user() variants attempt the access always under USER_DS set, the
two *_kernel() variants will -EFAULT when accessing user memory if the
underlying architecture has non-overlapping address ranges, also avoiding
throwing the kernel warning via 00c42373d397 ("x86-64: add warning for
non-canonical user access address dereferences").

Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper")
Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net
---
 include/uapi/linux/bpf.h       | 122 ++++++++++++++++++---------
 kernel/trace/bpf_trace.c       | 181 ++++++++++++++++++++++++++++++-----------
 tools/include/uapi/linux/bpf.h | 122 ++++++++++++++++++---------
 3 files changed, 299 insertions(+), 126 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a6bf19dabaab..df6809a76404 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -563,10 +563,13 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_probe_read(void *dst, u32 size, const void *src)
+ * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr)
  * 	Description
  * 		For tracing programs, safely attempt to read *size* bytes from
- * 		address *src* and store the data in *dst*.
+ * 		kernel space address *unsafe_ptr* and store the data in *dst*.
+ *
+ * 		Generally, use bpf_probe_read_user() or bpf_probe_read_kernel()
+ * 		instead.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
@@ -1428,45 +1431,14 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
+ * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr)
  * 	Description
- * 		Copy a NUL terminated string from an unsafe address
- * 		*unsafe_ptr* to *dst*. The *size* should include the
- * 		terminating NUL byte. In case the string length is smaller than
- * 		*size*, the target is not padded with further NUL bytes. If the
- * 		string length is larger than *size*, just *size*-1 bytes are
- * 		copied and the last byte is set to NUL.
- *
- * 		On success, the length of the copied string is returned. This
- * 		makes this helper useful in tracing programs for reading
- * 		strings, and more importantly to get its length at runtime. See
- * 		the following snippet:
- *
- * 		::
- *
- * 			SEC("kprobe/sys_open")
- * 			void bpf_sys_open(struct pt_regs *ctx)
- * 			{
- * 			        char buf[PATHLEN]; // PATHLEN is defined to 256
- * 			        int res = bpf_probe_read_str(buf, sizeof(buf),
- * 				                             ctx->di);
- *
- * 				// Consume buf, for example push it to
- * 				// userspace via bpf_perf_event_output(); we
- * 				// can use res (the string length) as event
- * 				// size, after checking its boundaries.
- * 			}
- *
- * 		In comparison, using **bpf_probe_read()** helper here instead
- * 		to read the string would require to estimate the length at
- * 		compile time, and would often result in copying more memory
- * 		than necessary.
+ * 		Copy a NUL terminated string from an unsafe kernel address
+ * 		*unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for
+ * 		more details.
  *
- * 		Another useful use case is when parsing individual process
- * 		arguments or individual environment variables navigating
- * 		*current*\ **->mm->arg_start** and *current*\
- * 		**->mm->env_start**: using this helper and the return value,
- * 		one can quickly iterate at the right offset of the memory area.
+ * 		Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str()
+ * 		instead.
  * 	Return
  * 		On success, the strictly positive length of the string,
  * 		including the trailing NUL character. On error, a negative
@@ -2777,6 +2749,72 @@ union bpf_attr {
  * 		restricted to raw_tracepoint bpf programs.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr)
+ * 	Description
+ * 		Safely attempt to read *size* bytes from user space address
+ * 		*unsafe_ptr* and store the data in *dst*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
+ * 	Description
+ * 		Safely attempt to read *size* bytes from kernel space address
+ * 		*unsafe_ptr* and store the data in *dst*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr)
+ * 	Description
+ * 		Copy a NUL terminated string from an unsafe user address
+ * 		*unsafe_ptr* to *dst*. The *size* should include the
+ * 		terminating NUL byte. In case the string length is smaller than
+ * 		*size*, the target is not padded with further NUL bytes. If the
+ * 		string length is larger than *size*, just *size*-1 bytes are
+ * 		copied and the last byte is set to NUL.
+ *
+ * 		On success, the length of the copied string is returned. This
+ * 		makes this helper useful in tracing programs for reading
+ * 		strings, and more importantly to get its length at runtime. See
+ * 		the following snippet:
+ *
+ * 		::
+ *
+ * 			SEC("kprobe/sys_open")
+ * 			void bpf_sys_open(struct pt_regs *ctx)
+ * 			{
+ * 			        char buf[PATHLEN]; // PATHLEN is defined to 256
+ * 			        int res = bpf_probe_read_user_str(buf, sizeof(buf),
+ * 				                                  ctx->di);
+ *
+ * 				// Consume buf, for example push it to
+ * 				// userspace via bpf_perf_event_output(); we
+ * 				// can use res (the string length) as event
+ * 				// size, after checking its boundaries.
+ * 			}
+ *
+ * 		In comparison, using **bpf_probe_read_user()** helper here
+ * 		instead to read the string would require to estimate the length
+ * 		at compile time, and would often result in copying more memory
+ * 		than necessary.
+ *
+ * 		Another useful use case is when parsing individual process
+ * 		arguments or individual environment variables navigating
+ * 		*current*\ **->mm->arg_start** and *current*\
+ * 		**->mm->env_start**: using this helper and the return value,
+ * 		one can quickly iterate at the right offset of the memory area.
+ * 	Return
+ * 		On success, the strictly positive length of the string,
+ * 		including the trailing NUL character. On error, a negative
+ * 		value.
+ *
+ * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr)
+ * 	Description
+ * 		Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
+ * 		to *dst*. Same semantics as with bpf_probe_read_user_str() apply.
+ * 	Return
+ * 		On success, the strictly positive length of the string,	including
+ * 		the trailing NUL character. On error, a negative value.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2890,7 +2928,11 @@ union bpf_attr {
 	FN(sk_storage_delete),		\
 	FN(send_signal),		\
 	FN(tcp_gen_syncookie),		\
-	FN(skb_output),
+	FN(skb_output),			\
+	FN(probe_read_user),		\
+	FN(probe_read_kernel),		\
+	FN(probe_read_user_str),	\
+	FN(probe_read_kernel_str),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 2d87fcdcb19b..ffc91d4935ac 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -138,24 +138,140 @@ static const struct bpf_func_proto bpf_override_return_proto = {
 };
 #endif
 
-BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
+BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
+	   const void __user *, unsafe_ptr)
 {
-	int ret;
+	int ret = probe_user_read(dst, unsafe_ptr, size);
 
-	ret = security_locked_down(LOCKDOWN_BPF_READ);
-	if (ret < 0)
-		goto out;
+	if (unlikely(ret < 0))
+		memset(dst, 0, size);
+
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_probe_read_user_proto = {
+	.func		= bpf_probe_read_user,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
+	   const void __user *, unsafe_ptr)
+{
+	int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size);
+
+	if (unlikely(ret < 0))
+		memset(dst, 0, size);
+
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_probe_read_user_str_proto = {
+	.func		= bpf_probe_read_user_str,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_ANYTHING,
+};
 
-	ret = probe_kernel_read(dst, unsafe_ptr, size);
+static __always_inline int
+bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr,
+			     const bool compat)
+{
+	int ret = security_locked_down(LOCKDOWN_BPF_READ);
+
+	if (unlikely(ret < 0))
+		goto out;
+	ret = compat ? probe_kernel_read(dst, unsafe_ptr, size) :
+	      probe_kernel_read_strict(dst, unsafe_ptr, size);
 	if (unlikely(ret < 0))
 out:
 		memset(dst, 0, size);
+	return ret;
+}
+
+BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
+	   const void *, unsafe_ptr)
+{
+	return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false);
+}
+
+static const struct bpf_func_proto bpf_probe_read_kernel_proto = {
+	.func		= bpf_probe_read_kernel,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
+	   const void *, unsafe_ptr)
+{
+	return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true);
+}
 
+static const struct bpf_func_proto bpf_probe_read_compat_proto = {
+	.func		= bpf_probe_read_compat,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+static __always_inline int
+bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr,
+				 const bool compat)
+{
+	int ret = security_locked_down(LOCKDOWN_BPF_READ);
+
+	if (unlikely(ret < 0))
+		goto out;
+	/*
+	 * The strncpy_from_unsafe_*() call will likely not fill the entire
+	 * buffer, but that's okay in this circumstance as we're probing
+	 * arbitrary memory anyway similar to bpf_probe_read_*() and might
+	 * as well probe the stack. Thus, memory is explicitly cleared
+	 * only in error case, so that improper users ignoring return
+	 * code altogether don't copy garbage; otherwise length of string
+	 * is returned that can be used for bpf_perf_event_output() et al.
+	 */
+	ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) :
+	      strncpy_from_unsafe_strict(dst, unsafe_ptr, size);
+	if (unlikely(ret < 0))
+out:
+		memset(dst, 0, size);
 	return ret;
 }
 
-static const struct bpf_func_proto bpf_probe_read_proto = {
-	.func		= bpf_probe_read,
+BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size,
+	   const void *, unsafe_ptr)
+{
+	return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false);
+}
+
+static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
+	.func		= bpf_probe_read_kernel_str,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size,
+	   const void *, unsafe_ptr)
+{
+	return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true);
+}
+
+static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
+	.func		= bpf_probe_read_compat_str,
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
@@ -583,41 +699,6 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size,
-	   const void *, unsafe_ptr)
-{
-	int ret;
-
-	ret = security_locked_down(LOCKDOWN_BPF_READ);
-	if (ret < 0)
-		goto out;
-
-	/*
-	 * The strncpy_from_unsafe() call will likely not fill the entire
-	 * buffer, but that's okay in this circumstance as we're probing
-	 * arbitrary memory anyway similar to bpf_probe_read() and might
-	 * as well probe the stack. Thus, memory is explicitly cleared
-	 * only in error case, so that improper users ignoring return
-	 * code altogether don't copy garbage; otherwise length of string
-	 * is returned that can be used for bpf_perf_event_output() et al.
-	 */
-	ret = strncpy_from_unsafe(dst, unsafe_ptr, size);
-	if (unlikely(ret < 0))
-out:
-		memset(dst, 0, size);
-
-	return ret;
-}
-
-static const struct bpf_func_proto bpf_probe_read_str_proto = {
-	.func		= bpf_probe_read_str,
-	.gpl_only	= true,
-	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
-	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
-	.arg3_type	= ARG_ANYTHING,
-};
-
 struct send_signal_irq_work {
 	struct irq_work irq_work;
 	struct task_struct *task;
@@ -697,8 +778,6 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_map_pop_elem_proto;
 	case BPF_FUNC_map_peek_elem:
 		return &bpf_map_peek_elem_proto;
-	case BPF_FUNC_probe_read:
-		return &bpf_probe_read_proto;
 	case BPF_FUNC_ktime_get_ns:
 		return &bpf_ktime_get_ns_proto;
 	case BPF_FUNC_tail_call:
@@ -725,8 +804,18 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_current_task_under_cgroup_proto;
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
+	case BPF_FUNC_probe_read_user:
+		return &bpf_probe_read_user_proto;
+	case BPF_FUNC_probe_read_kernel:
+		return &bpf_probe_read_kernel_proto;
+	case BPF_FUNC_probe_read:
+		return &bpf_probe_read_compat_proto;
+	case BPF_FUNC_probe_read_user_str:
+		return &bpf_probe_read_user_str_proto;
+	case BPF_FUNC_probe_read_kernel_str:
+		return &bpf_probe_read_kernel_str_proto;
 	case BPF_FUNC_probe_read_str:
-		return &bpf_probe_read_str_proto;
+		return &bpf_probe_read_compat_str_proto;
 #ifdef CONFIG_CGROUPS
 	case BPF_FUNC_get_current_cgroup_id:
 		return &bpf_get_current_cgroup_id_proto;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a6bf19dabaab..df6809a76404 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -563,10 +563,13 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_probe_read(void *dst, u32 size, const void *src)
+ * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr)
  * 	Description
  * 		For tracing programs, safely attempt to read *size* bytes from
- * 		address *src* and store the data in *dst*.
+ * 		kernel space address *unsafe_ptr* and store the data in *dst*.
+ *
+ * 		Generally, use bpf_probe_read_user() or bpf_probe_read_kernel()
+ * 		instead.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
@@ -1428,45 +1431,14 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
+ * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr)
  * 	Description
- * 		Copy a NUL terminated string from an unsafe address
- * 		*unsafe_ptr* to *dst*. The *size* should include the
- * 		terminating NUL byte. In case the string length is smaller than
- * 		*size*, the target is not padded with further NUL bytes. If the
- * 		string length is larger than *size*, just *size*-1 bytes are
- * 		copied and the last byte is set to NUL.
- *
- * 		On success, the length of the copied string is returned. This
- * 		makes this helper useful in tracing programs for reading
- * 		strings, and more importantly to get its length at runtime. See
- * 		the following snippet:
- *
- * 		::
- *
- * 			SEC("kprobe/sys_open")
- * 			void bpf_sys_open(struct pt_regs *ctx)
- * 			{
- * 			        char buf[PATHLEN]; // PATHLEN is defined to 256
- * 			        int res = bpf_probe_read_str(buf, sizeof(buf),
- * 				                             ctx->di);
- *
- * 				// Consume buf, for example push it to
- * 				// userspace via bpf_perf_event_output(); we
- * 				// can use res (the string length) as event
- * 				// size, after checking its boundaries.
- * 			}
- *
- * 		In comparison, using **bpf_probe_read()** helper here instead
- * 		to read the string would require to estimate the length at
- * 		compile time, and would often result in copying more memory
- * 		than necessary.
+ * 		Copy a NUL terminated string from an unsafe kernel address
+ * 		*unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for
+ * 		more details.
  *
- * 		Another useful use case is when parsing individual process
- * 		arguments or individual environment variables navigating
- * 		*current*\ **->mm->arg_start** and *current*\
- * 		**->mm->env_start**: using this helper and the return value,
- * 		one can quickly iterate at the right offset of the memory area.
+ * 		Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str()
+ * 		instead.
  * 	Return
  * 		On success, the strictly positive length of the string,
  * 		including the trailing NUL character. On error, a negative
@@ -2777,6 +2749,72 @@ union bpf_attr {
  * 		restricted to raw_tracepoint bpf programs.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr)
+ * 	Description
+ * 		Safely attempt to read *size* bytes from user space address
+ * 		*unsafe_ptr* and store the data in *dst*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
+ * 	Description
+ * 		Safely attempt to read *size* bytes from kernel space address
+ * 		*unsafe_ptr* and store the data in *dst*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr)
+ * 	Description
+ * 		Copy a NUL terminated string from an unsafe user address
+ * 		*unsafe_ptr* to *dst*. The *size* should include the
+ * 		terminating NUL byte. In case the string length is smaller than
+ * 		*size*, the target is not padded with further NUL bytes. If the
+ * 		string length is larger than *size*, just *size*-1 bytes are
+ * 		copied and the last byte is set to NUL.
+ *
+ * 		On success, the length of the copied string is returned. This
+ * 		makes this helper useful in tracing programs for reading
+ * 		strings, and more importantly to get its length at runtime. See
+ * 		the following snippet:
+ *
+ * 		::
+ *
+ * 			SEC("kprobe/sys_open")
+ * 			void bpf_sys_open(struct pt_regs *ctx)
+ * 			{
+ * 			        char buf[PATHLEN]; // PATHLEN is defined to 256
+ * 			        int res = bpf_probe_read_user_str(buf, sizeof(buf),
+ * 				                                  ctx->di);
+ *
+ * 				// Consume buf, for example push it to
+ * 				// userspace via bpf_perf_event_output(); we
+ * 				// can use res (the string length) as event
+ * 				// size, after checking its boundaries.
+ * 			}
+ *
+ * 		In comparison, using **bpf_probe_read_user()** helper here
+ * 		instead to read the string would require to estimate the length
+ * 		at compile time, and would often result in copying more memory
+ * 		than necessary.
+ *
+ * 		Another useful use case is when parsing individual process
+ * 		arguments or individual environment variables navigating
+ * 		*current*\ **->mm->arg_start** and *current*\
+ * 		**->mm->env_start**: using this helper and the return value,
+ * 		one can quickly iterate at the right offset of the memory area.
+ * 	Return
+ * 		On success, the strictly positive length of the string,
+ * 		including the trailing NUL character. On error, a negative
+ * 		value.
+ *
+ * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr)
+ * 	Description
+ * 		Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
+ * 		to *dst*. Same semantics as with bpf_probe_read_user_str() apply.
+ * 	Return
+ * 		On success, the strictly positive length of the string,	including
+ * 		the trailing NUL character. On error, a negative value.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2890,7 +2928,11 @@ union bpf_attr {
 	FN(sk_storage_delete),		\
 	FN(send_signal),		\
 	FN(tcp_gen_syncookie),		\
-	FN(skb_output),
+	FN(skb_output),			\
+	FN(probe_read_user),		\
+	FN(probe_read_kernel),		\
+	FN(probe_read_user_str),	\
+	FN(probe_read_kernel_str),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit v1.2.3-59-g8ed1b


From 0c65b2b90d13c1deaee6449304dd367c5d4eb8ae Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 4 Nov 2019 02:40:33 +0100
Subject: net: of_get_phy_mode: Change API to solve int/unit warnings

Before this change of_get_phy_mode() returned an enum,
phy_interface_t. On error, -ENODEV etc, is returned. If the result of
the function is stored in a variable of type phy_interface_t, and the
compiler has decided to represent this as an unsigned int, comparision
with -ENODEV etc, is a signed vs unsigned comparision.

Fix this problem by changing the API. Make the function return an
error, or 0 on success, and pass a pointer, of type phy_interface_t,
where the phy mode should be stored.

v2:
Return with *interface set to PHY_INTERFACE_MODE_NA on error.
Add error checks to all users of of_get_phy_mode()
Fixup a few reverse christmas tree errors
Fixup a few slightly malformed reverse christmas trees

v3:
Fix 0-day reported errors.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2.c                             |  7 ++++---
 drivers/net/dsa/microchip/ksz_common.c                |  7 ++++---
 drivers/net/dsa/mt7530.c                              |  8 ++++++--
 drivers/net/dsa/qca8k.c                               |  9 +++++----
 drivers/net/dsa/sja1105/sja1105_main.c                |  7 ++++---
 drivers/net/ethernet/altera/altera_tse_main.c         |  6 +++---
 drivers/net/ethernet/arc/emac_arc.c                   | 15 ++++++++++-----
 drivers/net/ethernet/arc/emac_rockchip.c              |  7 +++++--
 drivers/net/ethernet/atheros/ag71xx.c                 |  5 ++---
 drivers/net/ethernet/aurora/nb8800.c                  |  4 ++--
 drivers/net/ethernet/aurora/nb8800.h                  |  2 +-
 drivers/net/ethernet/broadcom/bcmsysport.c            |  4 ++--
 drivers/net/ethernet/broadcom/genet/bcmmii.c          |  8 ++++----
 drivers/net/ethernet/cadence/macb_main.c              |  7 ++++---
 drivers/net/ethernet/faraday/ftgmac100.c              |  6 +++---
 drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c      |  7 ++++---
 drivers/net/ethernet/freescale/enetc/enetc_pf.c       |  4 ++--
 drivers/net/ethernet/freescale/fec_main.c             |  7 ++++---
 drivers/net/ethernet/freescale/fman/mac.c             |  6 +++---
 drivers/net/ethernet/freescale/gianfar.c              |  7 ++++---
 drivers/net/ethernet/hisilicon/hip04_eth.c            |  7 +++----
 drivers/net/ethernet/hisilicon/hix5hd2_gmac.c         |  5 ++---
 drivers/net/ethernet/ibm/emac/core.c                  |  5 +++--
 drivers/net/ethernet/marvell/mv643xx_eth.c            |  7 ++++---
 drivers/net/ethernet/marvell/mvneta.c                 |  7 +++----
 drivers/net/ethernet/marvell/pxa168_eth.c             |  4 +++-
 drivers/net/ethernet/mediatek/mtk_eth_soc.c           |  8 ++++----
 drivers/net/ethernet/mscc/ocelot_board.c              | 12 ++++++------
 drivers/net/ethernet/ni/nixge.c                       |  5 ++---
 drivers/net/ethernet/renesas/ravb_main.c              |  4 +++-
 drivers/net/ethernet/renesas/sh_eth.c                 |  7 ++++---
 drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c   |  5 ++++-
 drivers/net/ethernet/socionext/sni_ave.c              |  6 +++---
 drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c   | 10 +++++++---
 drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c   |  5 +++--
 drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c  |  9 +++++----
 drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c   |  5 ++---
 drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c        |  4 ++--
 drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c       |  9 +++++++--
 drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c     |  7 ++++---
 drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c     |  8 ++++++--
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c |  6 +++---
 drivers/net/ethernet/ti/cpsw.c                        |  5 ++---
 drivers/net/ethernet/ti/cpsw_priv.h                   |  2 +-
 drivers/net/ethernet/ti/netcp_ethss.c                 |  5 +++--
 drivers/net/ethernet/xilinx/xilinx_axienet_main.c     |  6 ++----
 drivers/of/of_mdio.c                                  |  4 ++--
 drivers/of/of_net.c                                   | 16 +++++++++++-----
 include/linux/of_net.h                                |  7 +++++--
 include/linux/stmmac.h                                |  3 ++-
 include/linux/sxgbe_platform.h                        |  4 +++-
 net/dsa/port.c                                        | 13 +++++++------
 net/dsa/slave.c                                       |  7 ++++---
 53 files changed, 201 insertions(+), 149 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 9add84c79dd6..67125a5487e1 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -381,8 +381,9 @@ static void bcm_sf2_identify_ports(struct bcm_sf2_priv *priv,
 				   struct device_node *dn)
 {
 	struct device_node *port;
-	int mode;
 	unsigned int port_num;
+	phy_interface_t mode;
+	int err;
 
 	priv->moca_port = -1;
 
@@ -395,8 +396,8 @@ static void bcm_sf2_identify_ports(struct bcm_sf2_priv *priv,
 		 * has completed, since they might be turned off at that
 		 * time
 		 */
-		mode = of_get_phy_mode(port);
-		if (mode < 0)
+		err = of_get_phy_mode(port, &mode);
+		if (err)
 			continue;
 
 		if (mode == PHY_INTERFACE_MODE_INTERNAL)
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 5d08e4430824..d8fda4a02640 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -422,6 +422,7 @@ EXPORT_SYMBOL(ksz_switch_alloc);
 int ksz_switch_register(struct ksz_device *dev,
 			const struct ksz_dev_ops *ops)
 {
+	phy_interface_t interface;
 	int ret;
 
 	if (dev->pdata)
@@ -456,9 +457,9 @@ int ksz_switch_register(struct ksz_device *dev,
 	 * device tree.
 	 */
 	if (dev->dev->of_node) {
-		ret = of_get_phy_mode(dev->dev->of_node);
-		if (ret >= 0)
-			dev->interface = ret;
+		ret = of_get_phy_mode(dev->dev->of_node, &interface);
+		if (ret == 0)
+			dev->interface = interface;
 		dev->synclko_125 = of_property_read_bool(dev->dev->of_node,
 							 "microchip,synclko-125");
 	}
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index add9e4279176..ed1ec10ec62b 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -1340,7 +1340,9 @@ mt7530_setup(struct dsa_switch *ds)
 
 	if (!dsa_is_unused_port(ds, 5)) {
 		priv->p5_intf_sel = P5_INTF_SEL_GMAC5;
-		interface = of_get_phy_mode(dsa_to_port(ds, 5)->dn);
+		ret = of_get_phy_mode(dsa_to_port(ds, 5)->dn, &interface);
+		if (ret && ret != -ENODEV)
+			return ret;
 	} else {
 		/* Scan the ethernet nodes. look for GMAC1, lookup used phy */
 		for_each_child_of_node(dn, mac_np) {
@@ -1354,7 +1356,9 @@ mt7530_setup(struct dsa_switch *ds)
 
 			phy_node = of_parse_phandle(mac_np, "phy-handle", 0);
 			if (phy_node->parent == priv->dev->of_node->parent) {
-				interface = of_get_phy_mode(mac_np);
+				ret = of_get_phy_mode(mac_np, &interface);
+				if (ret && ret != -ENODEV)
+					return ret;
 				id = of_mdio_parse_addr(ds->dev, phy_node);
 				if (id == 0)
 					priv->p5_intf_sel = P5_INTF_SEL_PHY_P0;
diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index 36c6ed98f8e7..e548289df31e 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -639,7 +639,8 @@ static int
 qca8k_setup(struct dsa_switch *ds)
 {
 	struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv;
-	int ret, i, phy_mode = -1;
+	phy_interface_t phy_mode = PHY_INTERFACE_MODE_NA;
+	int ret, i;
 	u32 mask;
 
 	/* Make sure that port 0 is the cpu port */
@@ -661,10 +662,10 @@ qca8k_setup(struct dsa_switch *ds)
 		return ret;
 
 	/* Initialize CPU port pad mode (xMII type, delays...) */
-	phy_mode = of_get_phy_mode(dsa_to_port(ds, QCA8K_CPU_PORT)->dn);
-	if (phy_mode < 0) {
+	ret = of_get_phy_mode(dsa_to_port(ds, QCA8K_CPU_PORT)->dn, &phy_mode);
+	if (ret) {
 		pr_err("Can't find phy-mode for master device\n");
-		return phy_mode;
+		return ret;
 	}
 	ret = qca8k_set_pad_ctrl(priv, QCA8K_CPU_PORT, phy_mode);
 	if (ret < 0)
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index 2ae84a9dea59..d5dfda335aa1 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -584,8 +584,9 @@ static int sja1105_parse_ports_node(struct sja1105_private *priv,
 
 	for_each_child_of_node(ports_node, child) {
 		struct device_node *phy_node;
-		int phy_mode;
+		phy_interface_t phy_mode;
 		u32 index;
+		int err;
 
 		/* Get switch port number from DT */
 		if (of_property_read_u32(child, "reg", &index) < 0) {
@@ -596,8 +597,8 @@ static int sja1105_parse_ports_node(struct sja1105_private *priv,
 		}
 
 		/* Get PHY mode from DT */
-		phy_mode = of_get_phy_mode(child);
-		if (phy_mode < 0) {
+		err = of_get_phy_mode(child, &phy_mode);
+		if (err) {
 			dev_err(dev, "Failed to read phy-mode or "
 				"phy-interface-type property for port %d\n",
 				index);
diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c
index bb032be7fe31..4cd53fc338b5 100644
--- a/drivers/net/ethernet/altera/altera_tse_main.c
+++ b/drivers/net/ethernet/altera/altera_tse_main.c
@@ -730,12 +730,12 @@ static int altera_tse_phy_get_addr_mdio_create(struct net_device *dev)
 {
 	struct altera_tse_private *priv = netdev_priv(dev);
 	struct device_node *np = priv->device->of_node;
-	int ret = 0;
+	int ret;
 
-	priv->phy_iface = of_get_phy_mode(np);
+	ret = of_get_phy_mode(np, &priv->phy_iface);
 
 	/* Avoid get phy addr and create mdio if no phy is present */
-	if (!priv->phy_iface)
+	if (ret)
 		return 0;
 
 	/* try to get PHY address from device tree, use PHY autodetection if
diff --git a/drivers/net/ethernet/arc/emac_arc.c b/drivers/net/ethernet/arc/emac_arc.c
index 78e52d217e56..539166112993 100644
--- a/drivers/net/ethernet/arc/emac_arc.c
+++ b/drivers/net/ethernet/arc/emac_arc.c
@@ -20,9 +20,10 @@
 static int emac_arc_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
-	struct net_device *ndev;
 	struct arc_emac_priv *priv;
-	int interface, err;
+	phy_interface_t interface;
+	struct net_device *ndev;
+	int err;
 
 	if (!dev->of_node)
 		return -ENODEV;
@@ -37,9 +38,13 @@ static int emac_arc_probe(struct platform_device *pdev)
 	priv->drv_name = DRV_NAME;
 	priv->drv_version = DRV_VERSION;
 
-	interface = of_get_phy_mode(dev->of_node);
-	if (interface < 0)
-		interface = PHY_INTERFACE_MODE_MII;
+	err = of_get_phy_mode(dev->of_node, &interface);
+	if (err) {
+		if (err == -ENODEV)
+			interface = PHY_INTERFACE_MODE_MII;
+		else
+			goto out_netdev;
+	}
 
 	priv->clk = devm_clk_get(dev, "hclk");
 	if (IS_ERR(priv->clk)) {
diff --git a/drivers/net/ethernet/arc/emac_rockchip.c b/drivers/net/ethernet/arc/emac_rockchip.c
index 664d664e0925..aae231c5224f 100644
--- a/drivers/net/ethernet/arc/emac_rockchip.c
+++ b/drivers/net/ethernet/arc/emac_rockchip.c
@@ -97,8 +97,9 @@ static int emac_rockchip_probe(struct platform_device *pdev)
 	struct net_device *ndev;
 	struct rockchip_priv_data *priv;
 	const struct of_device_id *match;
+	phy_interface_t interface;
 	u32 data;
-	int err, interface;
+	int err;
 
 	if (!pdev->dev.of_node)
 		return -ENODEV;
@@ -114,7 +115,9 @@ static int emac_rockchip_probe(struct platform_device *pdev)
 	priv->emac.drv_version = DRV_VERSION;
 	priv->emac.set_mac_speed = emac_rockchip_set_mac_speed;
 
-	interface = of_get_phy_mode(dev->of_node);
+	err = of_get_phy_mode(dev->of_node, &interface);
+	if (err)
+		goto out_netdev;
 
 	/* RK3036/RK3066/RK3188 SoCs only support RMII */
 	if (interface != PHY_INTERFACE_MODE_RMII) {
diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c
index 1b1a09095c0d..8f5021091eee 100644
--- a/drivers/net/ethernet/atheros/ag71xx.c
+++ b/drivers/net/ethernet/atheros/ag71xx.c
@@ -1744,10 +1744,9 @@ static int ag71xx_probe(struct platform_device *pdev)
 		eth_random_addr(ndev->dev_addr);
 	}
 
-	ag->phy_if_mode = of_get_phy_mode(np);
-	if (ag->phy_if_mode < 0) {
+	err = of_get_phy_mode(np, ag->phy_if_mode);
+	if (err) {
 		netif_err(ag, probe, ndev, "missing phy-mode property in DT\n");
-		err = ag->phy_if_mode;
 		goto err_free;
 	}
 
diff --git a/drivers/net/ethernet/aurora/nb8800.c b/drivers/net/ethernet/aurora/nb8800.c
index 37752d9514e7..30b455013bf3 100644
--- a/drivers/net/ethernet/aurora/nb8800.c
+++ b/drivers/net/ethernet/aurora/nb8800.c
@@ -1371,8 +1371,8 @@ static int nb8800_probe(struct platform_device *pdev)
 	priv = netdev_priv(dev);
 	priv->base = base;
 
-	priv->phy_mode = of_get_phy_mode(pdev->dev.of_node);
-	if (priv->phy_mode < 0)
+	ret = of_get_phy_mode(pdev->dev.of_node, &priv->phy_mode);
+	if (ret)
 		priv->phy_mode = PHY_INTERFACE_MODE_RGMII;
 
 	priv->clk = devm_clk_get(&pdev->dev, NULL);
diff --git a/drivers/net/ethernet/aurora/nb8800.h b/drivers/net/ethernet/aurora/nb8800.h
index aacc3cce2cc0..40941fb6065b 100644
--- a/drivers/net/ethernet/aurora/nb8800.h
+++ b/drivers/net/ethernet/aurora/nb8800.h
@@ -287,7 +287,7 @@ struct nb8800_priv {
 	struct device_node		*phy_node;
 
 	/* PHY connection type from DT */
-	int				phy_mode;
+	phy_interface_t			phy_mode;
 
 	/* Current link status */
 	int				speed;
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index a977a459bd20..825af709708e 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -2479,9 +2479,9 @@ static int bcm_sysport_probe(struct platform_device *pdev)
 	priv->netdev = dev;
 	priv->pdev = pdev;
 
-	priv->phy_interface = of_get_phy_mode(dn);
+	ret = of_get_phy_mode(dn, &priv->phy_interface);
 	/* Default to GMII interface mode */
-	if ((int)priv->phy_interface < 0)
+	if (ret)
 		priv->phy_interface = PHY_INTERFACE_MODE_GMII;
 
 	/* In the case of a fixed PHY, the DT node associated
diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index 17bb8d60a157..b797a7e59a53 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -436,7 +436,7 @@ static int bcmgenet_mii_of_init(struct bcmgenet_priv *priv)
 	struct device_node *dn = priv->pdev->dev.of_node;
 	struct device *kdev = &priv->pdev->dev;
 	struct phy_device *phydev;
-	int phy_mode;
+	phy_interface_t phy_mode;
 	int ret;
 
 	/* Fetch the PHY phandle */
@@ -454,10 +454,10 @@ static int bcmgenet_mii_of_init(struct bcmgenet_priv *priv)
 	}
 
 	/* Get the link mode */
-	phy_mode = of_get_phy_mode(dn);
-	if (phy_mode < 0) {
+	ret = of_get_phy_mode(dn, &phy_mode);
+	if (ret) {
 		dev_err(kdev, "invalid PHY mode property\n");
-		return phy_mode;
+		return ret;
 	}
 
 	priv->phy_interface = phy_mode;
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 1e1b774e1953..b884cf7f339b 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -4182,6 +4182,7 @@ static int macb_probe(struct platform_device *pdev)
 	unsigned int queue_mask, num_queues;
 	bool native_io;
 	struct phy_device *phydev;
+	phy_interface_t interface;
 	struct net_device *dev;
 	struct resource *regs;
 	void __iomem *mem;
@@ -4308,12 +4309,12 @@ static int macb_probe(struct platform_device *pdev)
 		macb_get_hwaddr(bp);
 	}
 
-	err = of_get_phy_mode(np);
-	if (err < 0)
+	err = of_get_phy_mode(np, &interface);
+	if (err)
 		/* not found in DT, MII by default */
 		bp->phy_interface = PHY_INTERFACE_MODE_MII;
 	else
-		bp->phy_interface = err;
+		bp->phy_interface = interface;
 
 	/* IP specific init */
 	err = init(pdev);
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index da0c506349d1..a6f2063f1475 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -1612,7 +1612,7 @@ static int ftgmac100_setup_mdio(struct net_device *netdev)
 {
 	struct ftgmac100 *priv = netdev_priv(netdev);
 	struct platform_device *pdev = to_platform_device(priv->dev);
-	int phy_intf = PHY_INTERFACE_MODE_RGMII;
+	phy_interface_t phy_intf = PHY_INTERFACE_MODE_RGMII;
 	struct device_node *np = pdev->dev.of_node;
 	int i, err = 0;
 	u32 reg;
@@ -1637,8 +1637,8 @@ static int ftgmac100_setup_mdio(struct net_device *netdev)
 	/* Get PHY mode from device-tree */
 	if (np) {
 		/* Default to RGMII. It's a gigabit part after all */
-		phy_intf = of_get_phy_mode(np);
-		if (phy_intf < 0)
+		err = of_get_phy_mode(np, &phy_intf);
+		if (err)
 			phy_intf = PHY_INTERFACE_MODE_RGMII;
 
 		/* Aspeed only supports these. I don't know about other IP
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
index fea388d86f20..b713739f4804 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c
@@ -44,10 +44,11 @@ static struct device_node *dpaa2_mac_get_node(u16 dpmac_id)
 static int dpaa2_mac_get_if_mode(struct device_node *node,
 				 struct dpmac_attr attr)
 {
-	int if_mode;
+	phy_interface_t if_mode;
+	int err;
 
-	if_mode = of_get_phy_mode(node);
-	if (if_mode >= 0)
+	err = of_get_phy_mode(node, &if_mode);
+	if (!err)
 		return if_mode;
 
 	if_mode = phy_mode(attr.eth_if);
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
index b73421c3e25b..7da79b816416 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
@@ -784,8 +784,8 @@ static int enetc_of_get_phy(struct enetc_ndev_priv *priv)
 		}
 	}
 
-	priv->if_mode = of_get_phy_mode(np);
-	if ((int)priv->if_mode < 0) {
+	err = of_get_phy_mode(np, &priv->if_mode);
+	if (err) {
 		dev_err(priv->dev, "missing phy type\n");
 		of_node_put(priv->phy_node);
 		if (of_phy_is_fixed_link(np))
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 7d37ba9f6819..d4d6c2e941f1 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3393,6 +3393,7 @@ fec_probe(struct platform_device *pdev)
 {
 	struct fec_enet_private *fep;
 	struct fec_platform_data *pdata;
+	phy_interface_t interface;
 	struct net_device *ndev;
 	int i, irq, ret = 0;
 	const struct of_device_id *of_id;
@@ -3465,15 +3466,15 @@ fec_probe(struct platform_device *pdev)
 	}
 	fep->phy_node = phy_node;
 
-	ret = of_get_phy_mode(pdev->dev.of_node);
-	if (ret < 0) {
+	ret = of_get_phy_mode(pdev->dev.of_node, &interface);
+	if (ret) {
 		pdata = dev_get_platdata(&pdev->dev);
 		if (pdata)
 			fep->phy_interface = pdata->phy;
 		else
 			fep->phy_interface = PHY_INTERFACE_MODE_MII;
 	} else {
-		fep->phy_interface = ret;
+		fep->phy_interface = interface;
 	}
 
 	fep->clk_ipg = devm_clk_get(&pdev->dev, "ipg");
diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c
index 7ab8095db192..f0806ace1ae2 100644
--- a/drivers/net/ethernet/freescale/fman/mac.c
+++ b/drivers/net/ethernet/freescale/fman/mac.c
@@ -608,7 +608,7 @@ static int mac_probe(struct platform_device *_of_dev)
 	const u8		*mac_addr;
 	u32			 val;
 	u8			fman_id;
-	int			phy_if;
+	phy_interface_t          phy_if;
 
 	dev = &_of_dev->dev;
 	mac_node = dev->of_node;
@@ -776,8 +776,8 @@ static int mac_probe(struct platform_device *_of_dev)
 	}
 
 	/* Get the PHY connection type */
-	phy_if = of_get_phy_mode(mac_node);
-	if (phy_if < 0) {
+	err = of_get_phy_mode(mac_node, &phy_if);
+	if (err) {
 		dev_warn(dev,
 			 "of_get_phy_mode() for %pOF failed. Defaulting to SGMII\n",
 			 mac_node);
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index 51ad86417cb1..72868a28b621 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -641,6 +641,7 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev)
 	const char *model;
 	const void *mac_addr;
 	int err = 0, i;
+	phy_interface_t interface;
 	struct net_device *dev = NULL;
 	struct gfar_private *priv = NULL;
 	struct device_node *np = ofdev->dev.of_node;
@@ -805,9 +806,9 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev)
 	 * rgmii-id really needs to be specified. Other types can be
 	 * detected by hardware
 	 */
-	err = of_get_phy_mode(np);
-	if (err >= 0)
-		priv->interface = err;
+	err = of_get_phy_mode(np, &interface);
+	if (!err)
+		priv->interface = interface;
 	else
 		priv->interface = gfar_get_interface(dev);
 
diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c
index 4606a7e4a6d1..3e9b6d543c77 100644
--- a/drivers/net/ethernet/hisilicon/hip04_eth.c
+++ b/drivers/net/ethernet/hisilicon/hip04_eth.c
@@ -211,7 +211,7 @@ struct hip04_priv {
 #if defined(CONFIG_HI13X1_GMAC)
 	void __iomem *sysctrl_base;
 #endif
-	int phy_mode;
+	phy_interface_t phy_mode;
 	int chan;
 	unsigned int port;
 	unsigned int group;
@@ -961,10 +961,9 @@ static int hip04_mac_probe(struct platform_device *pdev)
 		goto init_fail;
 	}
 
-	priv->phy_mode = of_get_phy_mode(node);
-	if (priv->phy_mode < 0) {
+	ret = of_get_phy_mode(node, &priv->phy_mode);
+	if (ret) {
 		dev_warn(d, "not find phy-mode\n");
-		ret = -EINVAL;
 		goto init_fail;
 	}
 
diff --git a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c
index c41b19c760f8..247de9105d10 100644
--- a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c
+++ b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c
@@ -1193,10 +1193,9 @@ static int hix5hd2_dev_probe(struct platform_device *pdev)
 	if (ret)
 		goto err_free_mdio;
 
-	priv->phy_mode = of_get_phy_mode(node);
-	if ((int)priv->phy_mode < 0) {
+	ret = of_get_phy_mode(node, &priv->phy_mode);
+	if (ret) {
 		netdev_err(ndev, "not find phy-mode\n");
-		ret = -EINVAL;
 		goto err_mdiobus;
 	}
 
diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index 9e43c9ace9c2..2e40425d8a34 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -2849,6 +2849,7 @@ static int emac_init_config(struct emac_instance *dev)
 {
 	struct device_node *np = dev->ofdev->dev.of_node;
 	const void *p;
+	int err;
 
 	/* Read config from device-tree */
 	if (emac_read_uint_prop(np, "mal-device", &dev->mal_ph, 1))
@@ -2897,8 +2898,8 @@ static int emac_init_config(struct emac_instance *dev)
 		dev->mal_burst_size = 256;
 
 	/* PHY mode needs some decoding */
-	dev->phy_mode = of_get_phy_mode(np);
-	if (dev->phy_mode < 0)
+	err = of_get_phy_mode(np, &dev->phy_mode);
+	if (err)
 		dev->phy_mode = PHY_INTERFACE_MODE_NA;
 
 	/* Check EMAC version */
diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index 82ea55ae5053..d5b644131cff 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -2959,15 +2959,16 @@ static void set_params(struct mv643xx_eth_private *mp,
 static int get_phy_mode(struct mv643xx_eth_private *mp)
 {
 	struct device *dev = mp->dev->dev.parent;
-	int iface = -1;
+	phy_interface_t iface;
+	int err;
 
 	if (dev->of_node)
-		iface = of_get_phy_mode(dev->of_node);
+		err = of_get_phy_mode(dev->of_node, &iface);
 
 	/* Historical default if unspecified. We could also read/write
 	 * the interface state in the PSC1
 	 */
-	if (iface < 0)
+	if (!dev->of_node || err)
 		iface = PHY_INTERFACE_MODE_GMII;
 	return iface;
 }
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 8f9df6efda61..274ac39c0f0f 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -4797,9 +4797,9 @@ static int mvneta_probe(struct platform_device *pdev)
 	struct phy *comphy;
 	const char *dt_mac_addr;
 	char hw_mac_addr[ETH_ALEN];
+	phy_interface_t phy_mode;
 	const char *mac_from;
 	int tx_csum_limit;
-	int phy_mode;
 	int err;
 	int cpu;
 
@@ -4812,10 +4812,9 @@ static int mvneta_probe(struct platform_device *pdev)
 	if (dev->irq == 0)
 		return -EINVAL;
 
-	phy_mode = of_get_phy_mode(dn);
-	if (phy_mode < 0) {
+	err = of_get_phy_mode(dn, &phy_mode);
+	if (err) {
 		dev_err(&pdev->dev, "incorrect phy-mode\n");
-		err = -EINVAL;
 		goto err_free_irq;
 	}
 
diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c
index 51b77c2de400..3fb7ee3d4d13 100644
--- a/drivers/net/ethernet/marvell/pxa168_eth.c
+++ b/drivers/net/ethernet/marvell/pxa168_eth.c
@@ -1489,8 +1489,10 @@ static int pxa168_eth_probe(struct platform_device *pdev)
 			goto err_netdev;
 		}
 		of_property_read_u32(np, "reg", &pep->phy_addr);
-		pep->phy_intf = of_get_phy_mode(pdev->dev.of_node);
 		of_node_put(np);
+		err = of_get_phy_mode(pdev->dev.of_node, &pep->phy_intf);
+		if (err && err != -ENODEV)
+			goto err_netdev;
 	}
 
 	/* Hardware supports only 3 ports */
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 703adb96429e..385a4ab9ec99 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -2758,9 +2758,10 @@ static const struct net_device_ops mtk_netdev_ops = {
 static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np)
 {
 	const __be32 *_id = of_get_property(np, "reg", NULL);
+	phy_interface_t phy_mode;
 	struct phylink *phylink;
-	int phy_mode, id, err;
 	struct mtk_mac *mac;
+	int id, err;
 
 	if (!_id) {
 		dev_err(eth->dev, "missing mac id\n");
@@ -2805,10 +2806,9 @@ static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np)
 	mac->hw_stats->reg_offset = id * MTK_STAT_OFFSET;
 
 	/* phylink create */
-	phy_mode = of_get_phy_mode(np);
-	if (phy_mode < 0) {
+	err = of_get_phy_mode(np, &phy_mode);
+	if (err) {
 		dev_err(eth->dev, "incorrect phy-mode\n");
-		err = -EINVAL;
 		goto free_netdev;
 	}
 
diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c
index aac115136720..723724bdc139 100644
--- a/drivers/net/ethernet/mscc/ocelot_board.c
+++ b/drivers/net/ethernet/mscc/ocelot_board.c
@@ -364,12 +364,12 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
 
 	for_each_available_child_of_node(ports, portnp) {
 		struct device_node *phy_node;
+		phy_interface_t phy_mode;
 		struct phy_device *phy;
 		struct resource *res;
 		struct phy *serdes;
 		void __iomem *regs;
 		char res_name[8];
-		int phy_mode;
 		u32 port;
 
 		if (of_property_read_u32(portnp, "reg", &port))
@@ -398,11 +398,11 @@ static int mscc_ocelot_probe(struct platform_device *pdev)
 			goto out_put_ports;
 		}
 
-		phy_mode = of_get_phy_mode(portnp);
-		if (phy_mode < 0)
-			ocelot->ports[port]->phy_mode = PHY_INTERFACE_MODE_NA;
-		else
-			ocelot->ports[port]->phy_mode = phy_mode;
+		err = of_get_phy_mode(portnp, &phy_mode);
+		if (err && err != -ENODEV)
+			goto out_put_ports;
+
+		ocelot->ports[port]->phy_mode = phy_mode;
 
 		switch (ocelot->ports[port]->phy_mode) {
 		case PHY_INTERFACE_MODE_NA:
diff --git a/drivers/net/ethernet/ni/nixge.c b/drivers/net/ethernet/ni/nixge.c
index 2761f3a3ae50..49c7987c2abd 100644
--- a/drivers/net/ethernet/ni/nixge.c
+++ b/drivers/net/ethernet/ni/nixge.c
@@ -1346,10 +1346,9 @@ static int nixge_probe(struct platform_device *pdev)
 		}
 	}
 
-	priv->phy_mode = of_get_phy_mode(pdev->dev.of_node);
-	if ((int)priv->phy_mode < 0) {
+	err = of_get_phy_mode(pdev->dev.of_node, &priv->phy_mode);
+	if (err) {
 		netdev_err(ndev, "not find \"phy-mode\" property\n");
-		err = -EINVAL;
 		goto unregister_mdio;
 	}
 
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index de9aa8c47f1c..5ea14b5fbed8 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -2046,7 +2046,9 @@ static int ravb_probe(struct platform_device *pdev)
 	spin_lock_init(&priv->lock);
 	INIT_WORK(&priv->work, ravb_tx_timeout_work);
 
-	priv->phy_interface = of_get_phy_mode(np);
+	error = of_get_phy_mode(np, &priv->phy_interface);
+	if (error && error != -ENODEV)
+		goto out_release;
 
 	priv->no_avb_link = of_property_read_bool(np, "renesas,no-ether-link");
 	priv->avb_link_active_low =
diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
index 7ba35a0bdb29..e19b49c4013e 100644
--- a/drivers/net/ethernet/renesas/sh_eth.c
+++ b/drivers/net/ethernet/renesas/sh_eth.c
@@ -3183,6 +3183,7 @@ static struct sh_eth_plat_data *sh_eth_parse_dt(struct device *dev)
 {
 	struct device_node *np = dev->of_node;
 	struct sh_eth_plat_data *pdata;
+	phy_interface_t interface;
 	const char *mac_addr;
 	int ret;
 
@@ -3190,10 +3191,10 @@ static struct sh_eth_plat_data *sh_eth_parse_dt(struct device *dev)
 	if (!pdata)
 		return NULL;
 
-	ret = of_get_phy_mode(np);
-	if (ret < 0)
+	ret = of_get_phy_mode(np, &interface);
+	if (ret)
 		return NULL;
-	pdata->phy_interface = ret;
+	pdata->phy_interface = interface;
 
 	mac_addr = of_get_mac_address(np);
 	if (!IS_ERR(mac_addr))
diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c
index 2412c87561e0..33f79402850d 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c
@@ -30,12 +30,15 @@ static int sxgbe_probe_config_dt(struct platform_device *pdev,
 {
 	struct device_node *np = pdev->dev.of_node;
 	struct sxgbe_dma_cfg *dma_cfg;
+	int err;
 
 	if (!np)
 		return -ENODEV;
 
 	*mac = of_get_mac_address(np);
-	plat->interface = of_get_phy_mode(np);
+	err = of_get_phy_mode(np, &plat->interface);
+	if (err && err != -ENODEV)
+		return err;
 
 	plat->bus_id = of_alias_get_id(np, "ethernet");
 	if (plat->bus_id < 0)
diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c
index 6e984d5a729f..f7e927ad67fa 100644
--- a/drivers/net/ethernet/socionext/sni_ave.c
+++ b/drivers/net/ethernet/socionext/sni_ave.c
@@ -1565,10 +1565,10 @@ static int ave_probe(struct platform_device *pdev)
 		return -EINVAL;
 
 	np = dev->of_node;
-	phy_mode = of_get_phy_mode(np);
-	if ((int)phy_mode < 0) {
+	ret = of_get_phy_mode(np, &phy_mode);
+	if (ret) {
 		dev_err(dev, "phy-mode not found\n");
-		return -EINVAL;
+		return ret;
 	}
 
 	irq = platform_get_irq(pdev, 0);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c
index 527f93320a5a..d0d2d0fc5f0a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c
@@ -61,9 +61,10 @@ static void anarion_gmac_exit(struct platform_device *pdev, void *priv)
 
 static struct anarion_gmac *anarion_config_dt(struct platform_device *pdev)
 {
-	int phy_mode;
-	void __iomem *ctl_block;
 	struct anarion_gmac *gmac;
+	phy_interface_t phy_mode;
+	void __iomem *ctl_block;
+	int err;
 
 	ctl_block = devm_platform_ioremap_resource(pdev, 1);
 	if (IS_ERR(ctl_block)) {
@@ -78,7 +79,10 @@ static struct anarion_gmac *anarion_config_dt(struct platform_device *pdev)
 
 	gmac->ctl_block = (uintptr_t)ctl_block;
 
-	phy_mode = of_get_phy_mode(pdev->dev.of_node);
+	err = of_get_phy_mode(pdev->dev.of_node, &phy_mode);
+	if (err)
+		return ERR_PTR(err);
+
 	switch (phy_mode) {
 	case PHY_INTERFACE_MODE_RGMII:		/* Fall through */
 	case PHY_INTERFACE_MODE_RGMII_ID	/* Fall through */:
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
index 0d21082ceb93..6ae13dc19510 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
@@ -189,9 +189,10 @@ static int ipq806x_gmac_set_speed(struct ipq806x_gmac *gmac, unsigned int speed)
 static int ipq806x_gmac_of_parse(struct ipq806x_gmac *gmac)
 {
 	struct device *dev = &gmac->pdev->dev;
+	int ret;
 
-	gmac->phy_mode = of_get_phy_mode(dev->of_node);
-	if ((int)gmac->phy_mode < 0) {
+	ret = of_get_phy_mode(dev->of_node, &gmac->phy_mode);
+	if (ret) {
 		dev_err(dev, "missing phy mode property\n");
 		return -EINVAL;
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c
index cea7a0c7ce68..bdb80421acac 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c
@@ -54,7 +54,7 @@ struct mediatek_dwmac_plat_data {
 	struct device_node *np;
 	struct regmap *peri_regmap;
 	struct device *dev;
-	int phy_mode;
+	phy_interface_t phy_mode;
 	bool rmii_rxc;
 };
 
@@ -243,6 +243,7 @@ static int mediatek_dwmac_config_dt(struct mediatek_dwmac_plat_data *plat)
 {
 	struct mac_delay_struct *mac_delay = &plat->mac_delay;
 	u32 tx_delay_ps, rx_delay_ps;
+	int err;
 
 	plat->peri_regmap = syscon_regmap_lookup_by_phandle(plat->np, "mediatek,pericfg");
 	if (IS_ERR(plat->peri_regmap)) {
@@ -250,10 +251,10 @@ static int mediatek_dwmac_config_dt(struct mediatek_dwmac_plat_data *plat)
 		return PTR_ERR(plat->peri_regmap);
 	}
 
-	plat->phy_mode = of_get_phy_mode(plat->np);
-	if (plat->phy_mode < 0) {
+	err = of_get_phy_mode(plat->np, &plat->phy_mode);
+	if (err) {
 		dev_err(plat->dev, "not find phy-mode\n");
-		return -EINVAL;
+		return err;
 	}
 
 	if (!of_property_read_u32(plat->np, "mediatek,tx-delay-ps", &tx_delay_ps)) {
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
index 306da8f6b7d5..bd6c01004913 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c
@@ -338,10 +338,9 @@ static int meson8b_dwmac_probe(struct platform_device *pdev)
 	}
 
 	dwmac->dev = &pdev->dev;
-	dwmac->phy_mode = of_get_phy_mode(pdev->dev.of_node);
-	if ((int)dwmac->phy_mode < 0) {
+	ret = of_get_phy_mode(pdev->dev.of_node, &dwmac->phy_mode);
+	if (ret) {
 		dev_err(&pdev->dev, "missing phy-mode property\n");
-		ret = -EINVAL;
 		goto err_remove_config_dt;
 	}
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
index e2e469c37a4d..dc50ba13a746 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
@@ -37,7 +37,7 @@ struct rk_gmac_ops {
 
 struct rk_priv_data {
 	struct platform_device *pdev;
-	int phy_iface;
+	phy_interface_t phy_iface;
 	struct regulator *regulator;
 	bool suspended;
 	const struct rk_gmac_ops *ops;
@@ -1224,7 +1224,7 @@ static struct rk_priv_data *rk_gmac_setup(struct platform_device *pdev,
 	if (!bsp_priv)
 		return ERR_PTR(-ENOMEM);
 
-	bsp_priv->phy_iface = of_get_phy_mode(dev->of_node);
+	of_get_phy_mode(dev->of_node, &bsp_priv->phy_iface);
 	bsp_priv->ops = ops;
 
 	bsp_priv->regulator = devm_regulator_get_optional(dev, "phy");
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
index e9fd661f7995..e1b63df6f96f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
@@ -116,7 +116,7 @@
 #define ETH_PHY_SEL_MII		0x0
 
 struct sti_dwmac {
-	int interface;		/* MII interface */
+	phy_interface_t interface;	/* MII interface */
 	bool ext_phyclk;	/* Clock from external PHY */
 	u32 tx_retime_src;	/* TXCLK Retiming*/
 	struct clk *clk;	/* PHY clock */
@@ -269,7 +269,12 @@ static int sti_dwmac_parse_data(struct sti_dwmac *dwmac,
 		return err;
 	}
 
-	dwmac->interface = of_get_phy_mode(np);
+	err = of_get_phy_mode(np, &dwmac->interface);
+	if (err && err != -ENODEV) {
+		dev_err(dev, "Can't get phy-mode\n");
+		return err;
+	}
+
 	dwmac->regmap = regmap;
 	dwmac->gmac_en = of_property_read_bool(np, "st,gmac_en");
 	dwmac->ext_phyclk = of_property_read_bool(np, "st,ext-phyclk");
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
index ddcc191febdb..eefb06d918c8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
@@ -1105,6 +1105,7 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
 	struct stmmac_resources stmmac_res;
 	struct sunxi_priv_data *gmac;
 	struct device *dev = &pdev->dev;
+	phy_interface_t interface;
 	int ret;
 	struct stmmac_priv *priv;
 	struct net_device *ndev;
@@ -1178,10 +1179,10 @@ static int sun8i_dwmac_probe(struct platform_device *pdev)
 		return ret;
 	}
 
-	ret = of_get_phy_mode(dev->of_node);
-	if (ret < 0)
+	ret = of_get_phy_mode(dev->of_node, &interface);
+	if (ret)
 		return -EINVAL;
-	plat_dat->interface = ret;
+	plat_dat->interface = interface;
 
 	/* platform data specifying hardware features and callbacks.
 	 * hardware features were copied from Allwinner drivers.
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
index a299da3971b4..26353ef616b8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
@@ -18,7 +18,7 @@
 #include "stmmac_platform.h"
 
 struct sunxi_priv_data {
-	int interface;
+	phy_interface_t interface;
 	int clk_enabled;
 	struct clk *tx_clk;
 	struct regulator *regulator;
@@ -118,7 +118,11 @@ static int sun7i_gmac_probe(struct platform_device *pdev)
 		goto err_remove_config_dt;
 	}
 
-	gmac->interface = of_get_phy_mode(dev->of_node);
+	ret = of_get_phy_mode(dev->of_node, &gmac->interface);
+	if (ret && ret != -ENODEV) {
+		dev_err(dev, "Can't get phy-mode\n");
+		goto err_remove_config_dt;
+	}
 
 	gmac->tx_clk = devm_clk_get(dev, "allwinner_gmac_tx");
 	if (IS_ERR(gmac->tx_clk)) {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 170c3a052b14..bedaff0c13bd 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -412,9 +412,9 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 		*mac = NULL;
 	}
 
-	plat->phy_interface = of_get_phy_mode(np);
-	if (plat->phy_interface < 0)
-		return ERR_PTR(plat->phy_interface);
+	rc = of_get_phy_mode(np, &plat->phy_interface);
+	if (rc)
+		return ERR_PTR(rc);
 
 	plat->interface = stmmac_of_get_mac_mode(np);
 	if (plat->interface < 0)
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index f298d714efd6..329671e66fe4 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -2619,11 +2619,10 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
 				i);
 			goto no_phy_slave;
 		}
-		slave_data->phy_if = of_get_phy_mode(slave_node);
-		if (slave_data->phy_if < 0) {
+		ret = of_get_phy_mode(slave_node, &slave_data->phy_if);
+		if (ret) {
 			dev_err(&pdev->dev, "Missing or malformed slave[%d] phy-mode property\n",
 				i);
-			ret = slave_data->phy_if;
 			goto err_node_put;
 		}
 
diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h
index 362c5a986869..8bfa761fa552 100644
--- a/drivers/net/ethernet/ti/cpsw_priv.h
+++ b/drivers/net/ethernet/ti/cpsw_priv.h
@@ -275,7 +275,7 @@ struct cpsw_slave_data {
 	struct device_node *slave_node;
 	struct device_node *phy_node;
 	char		phy_id[MII_BUS_ID_SIZE];
-	int		phy_if;
+	phy_interface_t	phy_if;
 	u8		mac_addr[ETH_ALEN];
 	u16		dual_emac_res_vlan;	/* Reserved VLAN for DualEMAC */
 	struct phy	*ifphy;
diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 2c1fac33136c..86a3f42a3dcc 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -2291,6 +2291,7 @@ static int gbe_slave_open(struct gbe_intf *gbe_intf)
 	struct gbe_slave *slave = gbe_intf->slave;
 	phy_interface_t phy_mode;
 	bool has_phy = false;
+	int err;
 
 	void (*hndlr)(struct net_device *) = gbe_adjust_link;
 
@@ -2320,11 +2321,11 @@ static int gbe_slave_open(struct gbe_intf *gbe_intf)
 		slave->phy_port_t = PORT_MII;
 	} else if (slave->link_interface == RGMII_LINK_MAC_PHY) {
 		has_phy = true;
-		phy_mode = of_get_phy_mode(slave->node);
+		err = of_get_phy_mode(slave->node, &phy_mode);
 		/* if phy-mode is not present, default to
 		 * PHY_INTERFACE_MODE_RGMII
 		 */
-		if (phy_mode < 0)
+		if (err)
 			phy_mode = PHY_INTERFACE_MODE_RGMII;
 
 		if (!phy_interface_mode_is_rgmii(phy_mode)) {
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index 676006f32f91..867726d696e2 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -1761,11 +1761,9 @@ static int axienet_probe(struct platform_device *pdev)
 			goto free_netdev;
 		}
 	} else {
-		lp->phy_mode = of_get_phy_mode(pdev->dev.of_node);
-		if ((int)lp->phy_mode < 0) {
-			ret = -EINVAL;
+		ret = of_get_phy_mode(pdev->dev.of_node, &lp->phy_mode);
+		if (ret)
 			goto free_netdev;
-		}
 	}
 
 	/* Find the DMA node, map the DMA registers, and decode the DMA IRQs */
diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c
index bd6129db6417..c6b87ce2b0cc 100644
--- a/drivers/of/of_mdio.c
+++ b/drivers/of/of_mdio.c
@@ -361,8 +361,8 @@ struct phy_device *of_phy_get_and_connect(struct net_device *dev,
 	struct phy_device *phy;
 	int ret;
 
-	iface = of_get_phy_mode(np);
-	if ((int)iface < 0)
+	ret = of_get_phy_mode(np, &iface);
+	if (ret)
 		return NULL;
 	if (of_phy_is_fixed_link(np)) {
 		ret = of_phy_register_fixed_link(np);
diff --git a/drivers/of/of_net.c b/drivers/of/of_net.c
index b02734aff8c1..6e411821583e 100644
--- a/drivers/of/of_net.c
+++ b/drivers/of/of_net.c
@@ -15,16 +15,20 @@
 /**
  * of_get_phy_mode - Get phy mode for given device_node
  * @np:	Pointer to the given device_node
+ * @interface: Pointer to the result
  *
  * The function gets phy interface string from property 'phy-mode' or
- * 'phy-connection-type', and return its index in phy_modes table, or errno in
- * error case.
+ * 'phy-connection-type'. The index in phy_modes table is set in
+ * interface and 0 returned. In case of error interface is set to
+ * PHY_INTERFACE_MODE_NA and an errno is returned, e.g. -ENODEV.
  */
-int of_get_phy_mode(struct device_node *np)
+int of_get_phy_mode(struct device_node *np, phy_interface_t *interface)
 {
 	const char *pm;
 	int err, i;
 
+	*interface = PHY_INTERFACE_MODE_NA;
+
 	err = of_property_read_string(np, "phy-mode", &pm);
 	if (err < 0)
 		err = of_property_read_string(np, "phy-connection-type", &pm);
@@ -32,8 +36,10 @@ int of_get_phy_mode(struct device_node *np)
 		return err;
 
 	for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++)
-		if (!strcasecmp(pm, phy_modes(i)))
-			return i;
+		if (!strcasecmp(pm, phy_modes(i))) {
+			*interface = i;
+			return 0;
+		}
 
 	return -ENODEV;
 }
diff --git a/include/linux/of_net.h b/include/linux/of_net.h
index 6aeaea1775e6..71bbfcf3adcd 100644
--- a/include/linux/of_net.h
+++ b/include/linux/of_net.h
@@ -6,15 +6,18 @@
 #ifndef __LINUX_OF_NET_H
 #define __LINUX_OF_NET_H
 
+#include <linux/phy.h>
+
 #ifdef CONFIG_OF_NET
 #include <linux/of.h>
 
 struct net_device;
-extern int of_get_phy_mode(struct device_node *np);
+extern int of_get_phy_mode(struct device_node *np, phy_interface_t *interface);
 extern const void *of_get_mac_address(struct device_node *np);
 extern struct net_device *of_find_net_device_by_node(struct device_node *np);
 #else
-static inline int of_get_phy_mode(struct device_node *np)
+static inline int of_get_phy_mode(struct device_node *np,
+				  phy_interface_t *interface)
 {
 	return -ENODEV;
 }
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 86f9464c3f5d..d4bcd9387136 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -13,6 +13,7 @@
 #define __STMMAC_PLATFORM_DATA
 
 #include <linux/platform_device.h>
+#include <linux/phy.h>
 
 #define MTL_MAX_RX_QUEUES	8
 #define MTL_MAX_TX_QUEUES	8
@@ -132,7 +133,7 @@ struct plat_stmmacenet_data {
 	int bus_id;
 	int phy_addr;
 	int interface;
-	int phy_interface;
+	phy_interface_t phy_interface;
 	struct stmmac_mdio_bus_data *mdio_bus_data;
 	struct device_node *phy_node;
 	struct device_node *phylink_node;
diff --git a/include/linux/sxgbe_platform.h b/include/linux/sxgbe_platform.h
index 267369110584..85ec745767bd 100644
--- a/include/linux/sxgbe_platform.h
+++ b/include/linux/sxgbe_platform.h
@@ -10,6 +10,8 @@
 #ifndef __SXGBE_PLATFORM_H__
 #define __SXGBE_PLATFORM_H__
 
+#include <linux/phy.h>
+
 /* MDC Clock Selection define*/
 #define SXGBE_CSR_100_150M	0x0	/* MDC = clk_scr_i/62 */
 #define SXGBE_CSR_150_250M	0x1	/* MDC = clk_scr_i/102 */
@@ -38,7 +40,7 @@ struct sxgbe_plat_data {
 	char *phy_bus_name;
 	int bus_id;
 	int phy_addr;
-	int interface;
+	phy_interface_t interface;
 	struct sxgbe_mdio_bus_data *mdio_bus_data;
 	struct sxgbe_dma_cfg *dma_cfg;
 	int clk_csr;
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 9b54e5a76297..6e93c36bf0c0 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -561,7 +561,7 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp)
 	struct dsa_switch *ds = dp->ds;
 	struct phy_device *phydev;
 	int port = dp->index;
-	int mode;
+	phy_interface_t mode;
 	int err;
 
 	err = of_phy_register_fixed_link(dn);
@@ -574,8 +574,8 @@ static int dsa_port_fixed_link_register_of(struct dsa_port *dp)
 
 	phydev = of_phy_find_device(dn);
 
-	mode = of_get_phy_mode(dn);
-	if (mode < 0)
+	err = of_get_phy_mode(dn, &mode);
+	if (err)
 		mode = PHY_INTERFACE_MODE_NA;
 	phydev->interface = mode;
 
@@ -593,10 +593,11 @@ static int dsa_port_phylink_register(struct dsa_port *dp)
 {
 	struct dsa_switch *ds = dp->ds;
 	struct device_node *port_dn = dp->dn;
-	int mode, err;
+	phy_interface_t mode;
+	int err;
 
-	mode = of_get_phy_mode(port_dn);
-	if (mode < 0)
+	err = of_get_phy_mode(port_dn, &mode);
+	if (err)
 		mode = PHY_INTERFACE_MODE_NA;
 
 	dp->pl_config.dev = ds->dev;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index d18761649754..78ffc87dc25e 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1313,11 +1313,12 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev)
 	struct dsa_port *dp = dsa_slave_to_port(slave_dev);
 	struct device_node *port_dn = dp->dn;
 	struct dsa_switch *ds = dp->ds;
+	phy_interface_t mode;
 	u32 phy_flags = 0;
-	int mode, ret;
+	int ret;
 
-	mode = of_get_phy_mode(port_dn);
-	if (mode < 0)
+	ret = of_get_phy_mode(port_dn, &mode);
+	if (ret)
 		mode = PHY_INTERFACE_MODE_NA;
 
 	dp->pl_config.dev = &slave_dev->dev;
-- 
cgit v1.2.3-59-g8ed1b


From b6520fce073b619e6f2c0d510bb3481c9386c70b Mon Sep 17 00:00:00 2001
From: Kristian Evensen <kristian.evensen@gmail.com>
Date: Thu, 26 Sep 2019 12:06:45 +0200
Subject: netfilter: ipset: Add wildcard support to net,iface

The net,iface equal functions currently compares the full interface
names. In several cases, wildcard (or prefix) matching is useful. For
example, when converting a large iptables rule-set to make use of ipset,
I was able to significantly reduce the number of set elements by making
use of wildcard matching.

Wildcard matching is enabled by adding "wildcard" when adding an element
to a set. Internally, this causes the IPSET_FLAG_IFACE_WILDCARD-flag to
be set.  When this flag is set, only the initial part of the interface
name is used for comparison.

Wildcard matching is done per element and not per set, as there are many
cases where mixing wildcard and non-wildcard elements are useful. This
means that is up to the user to handle (avoid) overlapping interface
names.

Signed-off-by: Kristian Evensen <kristian.evensen@gmail.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
---
 include/uapi/linux/netfilter/ipset/ip_set.h |  2 ++
 net/netfilter/ipset/ip_set_hash_netiface.c  | 23 ++++++++++++++++++-----
 2 files changed, 20 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h
index eea166c52c36..11a72a938eb1 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set.h
@@ -205,6 +205,8 @@ enum ipset_cadt_flags {
 	IPSET_FLAG_WITH_FORCEADD = (1 << IPSET_FLAG_BIT_WITH_FORCEADD),
 	IPSET_FLAG_BIT_WITH_SKBINFO = 6,
 	IPSET_FLAG_WITH_SKBINFO = (1 << IPSET_FLAG_BIT_WITH_SKBINFO),
+	IPSET_FLAG_BIT_IFACE_WILDCARD = 7,
+	IPSET_FLAG_IFACE_WILDCARD = (1 << IPSET_FLAG_BIT_IFACE_WILDCARD),
 	IPSET_FLAG_CADT_MAX	= 15,
 };
 
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 1a04e0929738..be5e95a0d876 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -25,7 +25,8 @@
 /*				3    Counters support added */
 /*				4    Comments support added */
 /*				5    Forceadd support added */
-#define IPSET_TYPE_REV_MAX	6 /* skbinfo support added */
+/*				6    skbinfo support added */
+#define IPSET_TYPE_REV_MAX	7 /* interface wildcard support added */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>");
@@ -57,6 +58,7 @@ struct hash_netiface4_elem {
 	u8 cidr;
 	u8 nomatch;
 	u8 elem;
+	u8 wildcard;
 	char iface[IFNAMSIZ];
 };
 
@@ -71,7 +73,9 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
 	       ip1->cidr == ip2->cidr &&
 	       (++*multi) &&
 	       ip1->physdev == ip2->physdev &&
-	       strcmp(ip1->iface, ip2->iface) == 0;
+	       (ip1->wildcard ?
+		strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 :
+		strcmp(ip1->iface, ip2->iface) == 0);
 }
 
 static int
@@ -103,7 +107,8 @@ static bool
 hash_netiface4_data_list(struct sk_buff *skb,
 			 const struct hash_netiface4_elem *data)
 {
-	u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+	u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) |
+		    (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0);
 
 	if (data->nomatch)
 		flags |= IPSET_FLAG_NOMATCH;
@@ -229,6 +234,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 			e.physdev = 1;
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
 			flags |= (IPSET_FLAG_NOMATCH << 16);
+		if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD)
+			e.wildcard = 1;
 	}
 	if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
 		e.ip = htonl(ip & ip_set_hostmask(e.cidr));
@@ -280,6 +287,7 @@ struct hash_netiface6_elem {
 	u8 cidr;
 	u8 nomatch;
 	u8 elem;
+	u8 wildcard;
 	char iface[IFNAMSIZ];
 };
 
@@ -294,7 +302,9 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
 	       ip1->cidr == ip2->cidr &&
 	       (++*multi) &&
 	       ip1->physdev == ip2->physdev &&
-	       strcmp(ip1->iface, ip2->iface) == 0;
+	       (ip1->wildcard ?
+		strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 :
+		strcmp(ip1->iface, ip2->iface) == 0);
 }
 
 static int
@@ -326,7 +336,8 @@ static bool
 hash_netiface6_data_list(struct sk_buff *skb,
 			 const struct hash_netiface6_elem *data)
 {
-	u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
+	u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) |
+		    (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0);
 
 	if (data->nomatch)
 		flags |= IPSET_FLAG_NOMATCH;
@@ -440,6 +451,8 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
 			e.physdev = 1;
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
 			flags |= (IPSET_FLAG_NOMATCH << 16);
+		if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD)
+			e.wildcard = 1;
 	}
 
 	ret = adtfn(set, &e, &ext, &ext, flags);
-- 
cgit v1.2.3-59-g8ed1b


From 15122464d525f684a61806d28597050cdcef0f32 Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@redhat.com>
Date: Sat, 2 Nov 2019 01:12:03 +0100
Subject: icmp: add helpers to recognize ICMP error packets

Add two helper functions, one for IPv4 and one for IPv6, to recognize
the ICMP packets which are error responses.
This packets are special because they have as payload the original
header of the packet which generated it (RFC 792 says at least 8 bytes,
but Linux actually includes much more than that).

Signed-off-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/icmp.h   | 15 +++++++++++++++
 include/linux/icmpv6.h | 14 ++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/include/linux/icmp.h b/include/linux/icmp.h
index 2d8aaf7d4b9e..81ca84ce3119 100644
--- a/include/linux/icmp.h
+++ b/include/linux/icmp.h
@@ -20,4 +20,19 @@ static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb)
 {
 	return (struct icmphdr *)skb_transport_header(skb);
 }
+
+static inline bool icmp_is_err(int type)
+{
+	switch (type) {
+	case ICMP_DEST_UNREACH:
+	case ICMP_SOURCE_QUENCH:
+	case ICMP_REDIRECT:
+	case ICMP_TIME_EXCEEDED:
+	case ICMP_PARAMETERPROB:
+		return true;
+	}
+
+	return false;
+}
+
 #endif	/* _LINUX_ICMP_H */
diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h
index a8f888976137..ef1cbb5f454f 100644
--- a/include/linux/icmpv6.h
+++ b/include/linux/icmpv6.h
@@ -46,4 +46,18 @@ extern void				icmpv6_flow_init(struct sock *sk,
 							 const struct in6_addr *saddr,
 							 const struct in6_addr *daddr,
 							 int oif);
+
+static inline bool icmpv6_is_err(int type)
+{
+	switch (type) {
+	case ICMPV6_DEST_UNREACH:
+	case ICMPV6_PKT_TOOBIG:
+	case ICMPV6_TIME_EXCEED:
+	case ICMPV6_PARAMPROB:
+		return true;
+	}
+
+	return false;
+}
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 5cd73fbd78794d9c9c4e7a61dc8fa83489b43d03 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Tue, 5 Nov 2019 01:12:57 +0100
Subject: net: dsa: Add support for devlink resources

Add wrappers around the devlink resource API, so that DSA drivers can
register and unregister devlink resources.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 16 ++++++++++++++++
 net/dsa/dsa.c     | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index e4c697b95c70..9507611a41f0 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -586,6 +586,22 @@ int dsa_devlink_params_register(struct dsa_switch *ds,
 void dsa_devlink_params_unregister(struct dsa_switch *ds,
 				   const struct devlink_param *params,
 				   size_t params_count);
+int dsa_devlink_resource_register(struct dsa_switch *ds,
+				  const char *resource_name,
+				  u64 resource_size,
+				  u64 resource_id,
+				  u64 parent_resource_id,
+				  const struct devlink_resource_size_params *size_params);
+
+void dsa_devlink_resources_unregister(struct dsa_switch *ds);
+
+void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds,
+					   u64 resource_id,
+					   devlink_resource_occ_get_t *occ_get,
+					   void *occ_get_priv);
+void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds,
+					     u64 resource_id);
+
 struct dsa_devlink_priv {
 	struct dsa_switch *ds;
 };
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index db1c1c7e40e9..17281fec710c 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -379,6 +379,43 @@ void dsa_devlink_params_unregister(struct dsa_switch *ds,
 }
 EXPORT_SYMBOL_GPL(dsa_devlink_params_unregister);
 
+int dsa_devlink_resource_register(struct dsa_switch *ds,
+				  const char *resource_name,
+				  u64 resource_size,
+				  u64 resource_id,
+				  u64 parent_resource_id,
+				  const struct devlink_resource_size_params *size_params)
+{
+	return devlink_resource_register(ds->devlink, resource_name,
+					 resource_size, resource_id,
+					 parent_resource_id,
+					 size_params);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_register);
+
+void dsa_devlink_resources_unregister(struct dsa_switch *ds)
+{
+	devlink_resources_unregister(ds->devlink, NULL);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resources_unregister);
+
+void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds,
+					   u64 resource_id,
+					   devlink_resource_occ_get_t *occ_get,
+					   void *occ_get_priv)
+{
+	return devlink_resource_occ_get_register(ds->devlink, resource_id,
+						 occ_get, occ_get_priv);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_register);
+
+void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds,
+					     u64 resource_id)
+{
+	devlink_resource_occ_get_unregister(ds->devlink, resource_id);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister);
+
 static int __init dsa_init_module(void)
 {
 	int rc;
-- 
cgit v1.2.3-59-g8ed1b


From 4d390c287b2f3fbd0bb64c52c1a9418f790986e1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 4 Nov 2019 19:13:13 -0800
Subject: net_sched: do not export gnet_stats_basic_packed to uapi

gnet_stats_basic_packed was really meant to be private kernel structure.

If this proves to be a problem, we will have to rename the in-kernel
version.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gen_stats.h        | 6 ++++++
 include/uapi/linux/gen_stats.h | 4 ----
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index ca23860adbb9..5f3889e7ec1b 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -7,6 +7,12 @@
 #include <linux/rtnetlink.h>
 #include <linux/pkt_sched.h>
 
+/* Note: this used to be in include/uapi/linux/gen_stats.h */
+struct gnet_stats_basic_packed {
+	__u64	bytes;
+	__u32	packets;
+} __attribute__ ((packed));
+
 struct gnet_stats_basic_cpu {
 	struct gnet_stats_basic_packed bstats;
 	struct u64_stats_sync syncp;
diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h
index 065408e16a80..4eaacdf452e3 100644
--- a/include/uapi/linux/gen_stats.h
+++ b/include/uapi/linux/gen_stats.h
@@ -26,10 +26,6 @@ struct gnet_stats_basic {
 	__u64	bytes;
 	__u32	packets;
 };
-struct gnet_stats_basic_packed {
-	__u64	bytes;
-	__u32	packets;
-} __attribute__ ((packed));
 
 /**
  * struct gnet_stats_rate_est - rate estimator
-- 
cgit v1.2.3-59-g8ed1b


From d0083d98f685b9f4fe810570f93cef0b0bb6b354 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 4 Nov 2019 19:13:14 -0800
Subject: net_sched: extend packet counter to 64bit

After this change, qdisc packet counter is no longer
a 32bit quantity. We still export 32bit values to user.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/gen_stats.h | 4 ++--
 net/core/gen_stats.c    | 3 +--
 net/sched/act_simple.c  | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 5f3889e7ec1b..1424e02cef90 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -10,8 +10,8 @@
 /* Note: this used to be in include/uapi/linux/gen_stats.h */
 struct gnet_stats_basic_packed {
 	__u64	bytes;
-	__u32	packets;
-} __attribute__ ((packed));
+	__u64	packets;
+};
 
 struct gnet_stats_basic_cpu {
 	struct gnet_stats_basic_packed bstats;
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 36888f5e09eb..fe33e2a9841e 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -123,8 +123,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
 	for_each_possible_cpu(i) {
 		struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i);
 		unsigned int start;
-		u64 bytes;
-		u32 packets;
+		u64 bytes, packets;
 
 		do {
 			start = u64_stats_fetch_begin_irq(&bcpu->syncp);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 97639b259cd7..9813ca4006dd 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -35,7 +35,7 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a,
 	 * Example if this was the 3rd packet and the string was "hello"
 	 * then it would look like "hello_3" (without quotes)
 	 */
-	pr_info("simple: %s_%d\n",
+	pr_info("simple: %s_%llu\n",
 	       (char *)d->tcfd_defdata, d->tcf_bstats.packets);
 	spin_unlock(&d->tcf_lock);
 	return d->tcf_action;
-- 
cgit v1.2.3-59-g8ed1b


From b33e699fe43aa63f29113311f69357e119ef5276 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 4 Nov 2019 19:13:15 -0800
Subject: net_sched: add TCA_STATS_PKT64 attribute

Now the kernel uses 64bit packet counters in scheduler layer,
we want to export these counters to user space.

Instead risking breaking user space by adding fields
to struct gnet_stats_basic, add a new TCA_STATS_PKT64.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/gen_stats.h | 1 +
 net/core/gen_stats.c           | 9 +++++++--
 net/sched/act_api.c            | 2 ++
 3 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h
index 4eaacdf452e3..852f234f1fd6 100644
--- a/include/uapi/linux/gen_stats.h
+++ b/include/uapi/linux/gen_stats.h
@@ -13,6 +13,7 @@ enum {
 	TCA_STATS_RATE_EST64,
 	TCA_STATS_PAD,
 	TCA_STATS_BASIC_HW,
+	TCA_STATS_PKT64,
 	__TCA_STATS_MAX,
 };
 #define TCA_STATS_MAX (__TCA_STATS_MAX - 1)
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index fe33e2a9841e..1d653fbfcf52 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -175,12 +175,17 @@ ___gnet_stats_copy_basic(const seqcount_t *running,
 
 	if (d->tail) {
 		struct gnet_stats_basic sb;
+		int res;
 
 		memset(&sb, 0, sizeof(sb));
 		sb.bytes = bstats.bytes;
 		sb.packets = bstats.packets;
-		return gnet_stats_copy(d, type, &sb, sizeof(sb),
-				       TCA_STATS_PAD);
+		res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD);
+		if (res < 0 || sb.packets == bstats.packets)
+			return res;
+		/* emit 64bit stats only if needed */
+		return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets,
+				       sizeof(bstats.packets), TCA_STATS_PAD);
 	}
 	return 0;
 }
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 6284c552e943..bda1ba25c59e 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -188,6 +188,8 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
 		+ nla_total_size(0) /* TCA_ACT_STATS nested */
 		/* TCA_STATS_BASIC */
 		+ nla_total_size_64bit(sizeof(struct gnet_stats_basic))
+		/* TCA_STATS_PKT64 */
+		+ nla_total_size_64bit(sizeof(u64))
 		/* TCA_STATS_QUEUE */
 		+ nla_total_size_64bit(sizeof(struct gnet_stats_queue))
 		+ nla_total_size(0) /* TCA_OPTIONS nested */
-- 
cgit v1.2.3-59-g8ed1b


From 25c7a6d1f90e208ec27ca854b1381ed39842ec57 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Nov 2019 14:11:51 -0800
Subject: net: avoid potential false sharing in neighbor related code

There are common instances of the following construct :

	if (n->confirmed != now)
		n->confirmed = now;

A C compiler could legally remove the conditional.

Use READ_ONCE()/WRITE_ONCE() to avoid this problem.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/arp.h   |  4 ++--
 include/net/ndisc.h |  8 ++++----
 include/net/sock.h  | 12 ++++++------
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/arp.h b/include/net/arp.h
index c8f580a0e6b1..4950191f6b2b 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -57,8 +57,8 @@ static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key)
 		unsigned long now = jiffies;
 
 		/* avoid dirtying neighbour */
-		if (n->confirmed != now)
-			n->confirmed = now;
+		if (READ_ONCE(n->confirmed) != now)
+			WRITE_ONCE(n->confirmed, now);
 	}
 	rcu_read_unlock_bh();
 }
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index b2f715ca0567..b5ebeb3b0de0 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -414,8 +414,8 @@ static inline void __ipv6_confirm_neigh(struct net_device *dev,
 		unsigned long now = jiffies;
 
 		/* avoid dirtying neighbour */
-		if (n->confirmed != now)
-			n->confirmed = now;
+		if (READ_ONCE(n->confirmed) != now)
+			WRITE_ONCE(n->confirmed, now);
 	}
 	rcu_read_unlock_bh();
 }
@@ -431,8 +431,8 @@ static inline void __ipv6_confirm_neigh_stub(struct net_device *dev,
 		unsigned long now = jiffies;
 
 		/* avoid dirtying neighbour */
-		if (n->confirmed != now)
-			n->confirmed = now;
+		if (READ_ONCE(n->confirmed) != now)
+			WRITE_ONCE(n->confirmed, now);
 	}
 	rcu_read_unlock_bh();
 }
diff --git a/include/net/sock.h b/include/net/sock.h
index ac6042d0af32..f2f853439b65 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1939,8 +1939,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
 
 static inline void sk_dst_confirm(struct sock *sk)
 {
-	if (!sk->sk_dst_pending_confirm)
-		sk->sk_dst_pending_confirm = 1;
+	if (!READ_ONCE(sk->sk_dst_pending_confirm))
+		WRITE_ONCE(sk->sk_dst_pending_confirm, 1);
 }
 
 static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
@@ -1950,10 +1950,10 @@ static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
 		unsigned long now = jiffies;
 
 		/* avoid dirtying neighbour */
-		if (n->confirmed != now)
-			n->confirmed = now;
-		if (sk && sk->sk_dst_pending_confirm)
-			sk->sk_dst_pending_confirm = 0;
+		if (READ_ONCE(n->confirmed) != now)
+			WRITE_ONCE(n->confirmed, now);
+		if (sk && READ_ONCE(sk->sk_dst_pending_confirm))
+			WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 288efe8606b62d0753ba6722b36ef241877251fd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Nov 2019 14:11:53 -0800
Subject: net: annotate lockless accesses to sk->sk_ack_backlog

sk->sk_ack_backlog can be read without any lock being held.
We need to use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing
and/or potential KCSAN warnings.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h  | 6 +++---
 net/ipv4/tcp.c      | 2 +-
 net/ipv4/tcp_diag.c | 2 +-
 net/ipv4/tcp_ipv4.c | 2 +-
 net/ipv6/tcp_ipv6.c | 2 +-
 net/sched/em_meta.c | 2 +-
 net/sctp/diag.c     | 2 +-
 7 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index f2f853439b65..a126784aa7d9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -859,17 +859,17 @@ static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask)
 
 static inline void sk_acceptq_removed(struct sock *sk)
 {
-	sk->sk_ack_backlog--;
+	WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1);
 }
 
 static inline void sk_acceptq_added(struct sock *sk)
 {
-	sk->sk_ack_backlog++;
+	WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1);
 }
 
 static inline bool sk_acceptq_is_full(const struct sock *sk)
 {
-	return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
+	return READ_ONCE(sk->sk_ack_backlog) > sk->sk_max_ack_backlog;
 }
 
 /*
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1dd25189d83f..68375f7ffdce 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3225,7 +3225,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		 * tcpi_unacked -> Number of children ready for accept()
 		 * tcpi_sacked  -> max backlog
 		 */
-		info->tcpi_unacked = sk->sk_ack_backlog;
+		info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
 		info->tcpi_sacked = sk->sk_max_ack_backlog;
 		return;
 	}
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 549506162dde..edfbab54c46f 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -21,7 +21,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
 	struct tcp_info *info = _info;
 
 	if (inet_sk_state_load(sk) == TCP_LISTEN) {
-		r->idiag_rqueue = sk->sk_ack_backlog;
+		r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog);
 		r->idiag_wqueue = sk->sk_max_ack_backlog;
 	} else if (sk->sk_type == SOCK_STREAM) {
 		const struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 899e100a68e6..92282f98dc82 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2451,7 +2451,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 
 	state = inet_sk_state_load(sk);
 	if (state == TCP_LISTEN)
-		rx_queue = sk->sk_ack_backlog;
+		rx_queue = READ_ONCE(sk->sk_ack_backlog);
 	else
 		/* Because we don't lock the socket,
 		 * we might find a transient negative value.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4804b6dc5e65..81f51335e326 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1891,7 +1891,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 
 	state = inet_sk_state_load(sp);
 	if (state == TCP_LISTEN)
-		rx_queue = sp->sk_ack_backlog;
+		rx_queue = READ_ONCE(sp->sk_ack_backlog);
 	else
 		/* Because we don't lock the socket,
 		 * we might find a transient negative value.
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 3177dcb17316..ebb6e2430861 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -521,7 +521,7 @@ META_COLLECTOR(int_sk_ack_bl)
 		*err = -1;
 		return;
 	}
-	dst->value = sk->sk_ack_backlog;
+	dst->value = READ_ONCE(sk->sk_ack_backlog);
 }
 
 META_COLLECTOR(int_sk_max_ack_bl)
diff --git a/net/sctp/diag.c b/net/sctp/diag.c
index 0851166b9175..f873f15407de 100644
--- a/net/sctp/diag.c
+++ b/net/sctp/diag.c
@@ -425,7 +425,7 @@ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
 		r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc);
 		r->idiag_wqueue = infox->asoc->sndbuf_used;
 	} else {
-		r->idiag_rqueue = sk->sk_ack_backlog;
+		r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog);
 		r->idiag_wqueue = sk->sk_max_ack_backlog;
 	}
 	if (infox->sctpinfo)
-- 
cgit v1.2.3-59-g8ed1b


From 099ecf59f05b5f30f42ebac0ab8cb94f9b18c90c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 5 Nov 2019 14:11:54 -0800
Subject: net: annotate lockless accesses to sk->sk_max_ack_backlog

sk->sk_max_ack_backlog can be read without any lock being held
at least in TCP/DCCP cases.

We need to use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing
and/or potential KCSAN warnings.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h              | 2 +-
 net/dccp/proto.c                | 2 +-
 net/ipv4/af_inet.c              | 2 +-
 net/ipv4/inet_connection_sock.c | 2 +-
 net/ipv4/tcp.c                  | 2 +-
 net/ipv4/tcp_diag.c             | 2 +-
 net/sched/em_meta.c             | 2 +-
 net/sctp/diag.c                 | 2 +-
 net/sctp/socket.c               | 4 ++--
 9 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index a126784aa7d9..d4d3ef5ba049 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -869,7 +869,7 @@ static inline void sk_acceptq_added(struct sock *sk)
 
 static inline bool sk_acceptq_is_full(const struct sock *sk)
 {
-	return READ_ONCE(sk->sk_ack_backlog) > sk->sk_max_ack_backlog;
+	return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
 }
 
 /*
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 5bad08dc4316..a52e8ba1ced0 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -944,7 +944,7 @@ int inet_dccp_listen(struct socket *sock, int backlog)
 	if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
 		goto out;
 
-	sk->sk_max_ack_backlog = backlog;
+	WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
 	/* Really, if the socket is already in listen state
 	 * we can only allow the backlog to be adjusted.
 	 */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 70f92aaca411..53de8e00990e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -208,7 +208,7 @@ int inet_listen(struct socket *sock, int backlog)
 	if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
 		goto out;
 
-	sk->sk_max_ack_backlog = backlog;
+	WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
 	/* Really, if the socket is already in listen state
 	 * we can only allow the backlog to be adjusted.
 	 */
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index eb30fc1770de..e4c6e8b40490 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -716,7 +716,7 @@ static void reqsk_timer_handler(struct timer_list *t)
 	 * ones are about to clog our table.
 	 */
 	qlen = reqsk_queue_len(queue);
-	if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) {
+	if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) {
 		int young = reqsk_queue_len_young(queue) << 1;
 
 		while (thresh > 2) {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 68375f7ffdce..fb1666440e10 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3226,7 +3226,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		 * tcpi_sacked  -> max backlog
 		 */
 		info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
-		info->tcpi_sacked = sk->sk_max_ack_backlog;
+		info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
 		return;
 	}
 
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index edfbab54c46f..0d08f9e2d8d0 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -22,7 +22,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
 
 	if (inet_sk_state_load(sk) == TCP_LISTEN) {
 		r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog);
-		r->idiag_wqueue = sk->sk_max_ack_backlog;
+		r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog);
 	} else if (sk->sk_type == SOCK_STREAM) {
 		const struct tcp_sock *tp = tcp_sk(sk);
 
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index ebb6e2430861..d99966a55c84 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -532,7 +532,7 @@ META_COLLECTOR(int_sk_max_ack_bl)
 		*err = -1;
 		return;
 	}
-	dst->value = sk->sk_max_ack_backlog;
+	dst->value = READ_ONCE(sk->sk_max_ack_backlog);
 }
 
 META_COLLECTOR(int_sk_prio)
diff --git a/net/sctp/diag.c b/net/sctp/diag.c
index f873f15407de..8a15146faaeb 100644
--- a/net/sctp/diag.c
+++ b/net/sctp/diag.c
@@ -426,7 +426,7 @@ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
 		r->idiag_wqueue = infox->asoc->sndbuf_used;
 	} else {
 		r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog);
-		r->idiag_wqueue = sk->sk_max_ack_backlog;
+		r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog);
 	}
 	if (infox->sctpinfo)
 		sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ffd3262b7a41..53abb97e0061 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -8376,7 +8376,7 @@ static int sctp_listen_start(struct sock *sk, int backlog)
 		}
 	}
 
-	sk->sk_max_ack_backlog = backlog;
+	WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
 	return sctp_hash_endpoint(ep);
 }
 
@@ -8430,7 +8430,7 @@ int sctp_inet_listen(struct socket *sock, int backlog)
 
 	/* If we are already listening, just update the backlog */
 	if (sctp_sstate(sk, LISTENING))
-		sk->sk_max_ack_backlog = backlog;
+		WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
 	else {
 		err = sctp_listen_start(sk, backlog);
 		if (err)
-- 
cgit v1.2.3-59-g8ed1b


From 4ece477870774698e6e73d5821a3dd1605ca123b Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 6 Nov 2019 17:01:05 +0800
Subject: lwtunnel: add options setting and dumping for geneve

To add options setting and dumping, .build_state(), .fill_encap() and
.get_encap_size() in ip_tun_lwt_ops needs to be extended:

ip_tun_build_state():
  ip_tun_parse_opts():
    ip_tun_parse_opts_geneve()

ip_tun_fill_encap_info():
  ip_tun_fill_encap_opts():
    ip_tun_fill_encap_opts_geneve()

ip_tun_encap_nlsize()
   ip_tun_opts_nlsize():
     if (tun_flags & TUNNEL_GENEVE_OPT)

ip_tun_parse_opts(), ip_tun_fill_encap_opts() and ip_tun_opts_nlsize()
processes LWTUNNEL_IP_OPTS.

ip_tun_parse_opts_geneve(), ip_tun_fill_encap_opts_geneve() and
if (tun_flags & TUNNEL_GENEVE_OPT) processes LWTUNNEL_IP_OPTS_GENEVE.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/lwtunnel.h |  20 ++++
 net/ipv4/ip_tunnel_core.c     | 212 ++++++++++++++++++++++++++++++++++++++----
 2 files changed, 216 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index de696ca12f2c..b595ab219036 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -27,6 +27,7 @@ enum lwtunnel_ip_t {
 	LWTUNNEL_IP_TOS,
 	LWTUNNEL_IP_FLAGS,
 	LWTUNNEL_IP_PAD,
+	LWTUNNEL_IP_OPTS,
 	__LWTUNNEL_IP_MAX,
 };
 
@@ -41,11 +42,30 @@ enum lwtunnel_ip6_t {
 	LWTUNNEL_IP6_TC,
 	LWTUNNEL_IP6_FLAGS,
 	LWTUNNEL_IP6_PAD,
+	LWTUNNEL_IP6_OPTS,
 	__LWTUNNEL_IP6_MAX,
 };
 
 #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1)
 
+enum {
+	LWTUNNEL_IP_OPTS_UNSPEC,
+	LWTUNNEL_IP_OPTS_GENEVE,
+	__LWTUNNEL_IP_OPTS_MAX,
+};
+
+#define LWTUNNEL_IP_OPTS_MAX (__LWTUNNEL_IP_OPTS_MAX - 1)
+
+enum {
+	LWTUNNEL_IP_OPT_GENEVE_UNSPEC,
+	LWTUNNEL_IP_OPT_GENEVE_CLASS,
+	LWTUNNEL_IP_OPT_GENEVE_TYPE,
+	LWTUNNEL_IP_OPT_GENEVE_DATA,
+	__LWTUNNEL_IP_OPT_GENEVE_MAX,
+};
+
+#define LWTUNNEL_IP_OPT_GENEVE_MAX (__LWTUNNEL_IP_OPT_GENEVE_MAX - 1)
+
 enum {
 	LWT_BPF_PROG_UNSPEC,
 	LWT_BPF_PROG_FD,
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index c0b5bad8e12a..1ec9d9419c34 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -34,6 +34,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/dst_metadata.h>
+#include <net/geneve.h>
 
 const struct ip_tunnel_encap_ops __rcu *
 		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
@@ -218,24 +219,112 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
 	[LWTUNNEL_IP_TTL]	= { .type = NLA_U8 },
 	[LWTUNNEL_IP_TOS]	= { .type = NLA_U8 },
 	[LWTUNNEL_IP_FLAGS]	= { .type = NLA_U16 },
+	[LWTUNNEL_IP_OPTS]	= { .type = NLA_NESTED },
 };
 
+static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = {
+	[LWTUNNEL_IP_OPTS_GENEVE]	= { .type = NLA_NESTED },
+};
+
+static const struct nla_policy
+geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = {
+	[LWTUNNEL_IP_OPT_GENEVE_CLASS]	= { .type = NLA_U16 },
+	[LWTUNNEL_IP_OPT_GENEVE_TYPE]	= { .type = NLA_U8 },
+	[LWTUNNEL_IP_OPT_GENEVE_DATA]	= { .type = NLA_BINARY, .len = 128 },
+};
+
+static int ip_tun_parse_opts_geneve(struct nlattr *attr,
+				    struct ip_tunnel_info *info,
+				    struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1];
+	int data_len, err;
+
+	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_OPT_GENEVE_MAX,
+					  attr, geneve_opt_policy, extack);
+	if (err)
+		return err;
+
+	if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] ||
+	    !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] ||
+	    !tb[LWTUNNEL_IP_OPT_GENEVE_DATA])
+		return -EINVAL;
+
+	attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA];
+	data_len = nla_len(attr);
+	if (data_len % 4)
+		return -EINVAL;
+
+	if (info) {
+		struct geneve_opt *opt = ip_tunnel_info_opts(info);
+
+		memcpy(opt->opt_data, nla_data(attr), data_len);
+		opt->length = data_len / 4;
+		attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS];
+		opt->opt_class = nla_get_be16(attr);
+		attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE];
+		opt->type = nla_get_u8(attr);
+		info->key.tun_flags |= TUNNEL_GENEVE_OPT;
+	}
+
+	return sizeof(struct geneve_opt) + data_len;
+}
+
+static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
+			     struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[LWTUNNEL_IP_OPTS_MAX + 1];
+	int err;
+
+	if (!attr)
+		return 0;
+
+	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_OPTS_MAX, attr,
+					  ip_opts_policy, extack);
+	if (err)
+		return err;
+
+	if (tb[LWTUNNEL_IP_OPTS_GENEVE])
+		err = ip_tun_parse_opts_geneve(tb[LWTUNNEL_IP_OPTS_GENEVE],
+					       info, extack);
+	else
+		err = -EINVAL;
+
+	return err;
+}
+
+static int ip_tun_get_optlen(struct nlattr *attr,
+			     struct netlink_ext_ack *extack)
+{
+	return ip_tun_parse_opts(attr, NULL, extack);
+}
+
+static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info,
+			   struct netlink_ext_ack *extack)
+{
+	return ip_tun_parse_opts(attr, info, extack);
+}
+
 static int ip_tun_build_state(struct nlattr *attr,
 			      unsigned int family, const void *cfg,
 			      struct lwtunnel_state **ts,
 			      struct netlink_ext_ack *extack)
 {
-	struct ip_tunnel_info *tun_info;
-	struct lwtunnel_state *new_state;
 	struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
-	int err;
+	struct lwtunnel_state *new_state;
+	struct ip_tunnel_info *tun_info;
+	int err, opt_len;
 
 	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr,
 					  ip_tun_policy, extack);
 	if (err < 0)
 		return err;
 
-	new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+	opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack);
+	if (opt_len < 0)
+		return opt_len;
+
+	new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
 	if (!new_state)
 		return -ENOMEM;
 
@@ -243,6 +332,12 @@ static int ip_tun_build_state(struct nlattr *attr,
 
 	tun_info = lwt_tun_info(new_state);
 
+	err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack);
+	if (err < 0) {
+		lwtstate_free(new_state);
+		return err;
+	}
+
 #ifdef CONFIG_DST_CACHE
 	err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL);
 	if (err) {
@@ -267,10 +362,10 @@ static int ip_tun_build_state(struct nlattr *attr,
 		tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
 
 	if (tb[LWTUNNEL_IP_FLAGS])
-		tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP_FLAGS]);
+		tun_info->key.tun_flags |= nla_get_be16(tb[LWTUNNEL_IP_FLAGS]);
 
 	tun_info->mode = IP_TUNNEL_INFO_TX;
-	tun_info->options_len = 0;
+	tun_info->options_len = opt_len;
 
 	*ts = new_state;
 
@@ -286,6 +381,54 @@ static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate)
 #endif
 }
 
+static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb,
+					 struct ip_tunnel_info *tun_info)
+{
+	struct geneve_opt *opt;
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE);
+	if (!nest)
+		return -ENOMEM;
+
+	opt = ip_tunnel_info_opts(tun_info);
+	if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS, opt->opt_class) ||
+	    nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) ||
+	    nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4,
+		    opt->opt_data)) {
+		nla_nest_cancel(skb, nest);
+		return -ENOMEM;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
+				  struct ip_tunnel_info *tun_info)
+{
+	struct nlattr *nest;
+	int err = 0;
+
+	if (!(tun_info->key.tun_flags & TUNNEL_GENEVE_OPT))
+		return 0;
+
+	nest = nla_nest_start_noflag(skb, type);
+	if (!nest)
+		return -ENOMEM;
+
+	if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT)
+		err = ip_tun_fill_encap_opts_geneve(skb, tun_info);
+
+	if (err) {
+		nla_nest_cancel(skb, nest);
+		return err;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
 static int ip_tun_fill_encap_info(struct sk_buff *skb,
 				  struct lwtunnel_state *lwtstate)
 {
@@ -297,12 +440,34 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
 	    nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
 	    nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
 	    nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
-	    nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
+	    nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) ||
+	    ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info))
 		return -ENOMEM;
 
 	return 0;
 }
 
+static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
+{
+	int opt_len;
+
+	if (!(info->key.tun_flags & TUNNEL_GENEVE_OPT))
+		return 0;
+
+	opt_len = nla_total_size(0);		/* LWTUNNEL_IP_OPTS */
+	if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
+		struct geneve_opt *opt = ip_tunnel_info_opts(info);
+
+		opt_len += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_GENEVE */
+			   + nla_total_size(2)	/* OPT_GENEVE_CLASS */
+			   + nla_total_size(1)	/* OPT_GENEVE_TYPE */
+			   + nla_total_size(opt->length * 4);
+						/* OPT_GENEVE_DATA */
+	}
+
+	return opt_len;
+}
+
 static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
 {
 	return nla_total_size_64bit(8)	/* LWTUNNEL_IP_ID */
@@ -310,7 +475,9 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
 		+ nla_total_size(4)	/* LWTUNNEL_IP_SRC */
 		+ nla_total_size(1)	/* LWTUNNEL_IP_TOS */
 		+ nla_total_size(1)	/* LWTUNNEL_IP_TTL */
-		+ nla_total_size(2);	/* LWTUNNEL_IP_FLAGS */
+		+ nla_total_size(2)	/* LWTUNNEL_IP_FLAGS */
+		+ ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
+					/* LWTUNNEL_IP_OPTS */
 }
 
 static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
@@ -348,17 +515,21 @@ static int ip6_tun_build_state(struct nlattr *attr,
 			       struct lwtunnel_state **ts,
 			       struct netlink_ext_ack *extack)
 {
-	struct ip_tunnel_info *tun_info;
-	struct lwtunnel_state *new_state;
 	struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
-	int err;
+	struct lwtunnel_state *new_state;
+	struct ip_tunnel_info *tun_info;
+	int err, opt_len;
 
 	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr,
 					  ip6_tun_policy, extack);
 	if (err < 0)
 		return err;
 
-	new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+	opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack);
+	if (opt_len < 0)
+		return opt_len;
+
+	new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
 	if (!new_state)
 		return -ENOMEM;
 
@@ -366,6 +537,12 @@ static int ip6_tun_build_state(struct nlattr *attr,
 
 	tun_info = lwt_tun_info(new_state);
 
+	err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack);
+	if (err < 0) {
+		lwtstate_free(new_state);
+		return err;
+	}
+
 	if (tb[LWTUNNEL_IP6_ID])
 		tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]);
 
@@ -382,10 +559,10 @@ static int ip6_tun_build_state(struct nlattr *attr,
 		tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
 
 	if (tb[LWTUNNEL_IP6_FLAGS])
-		tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
+		tun_info->key.tun_flags |= nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
 
 	tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
-	tun_info->options_len = 0;
+	tun_info->options_len = opt_len;
 
 	*ts = new_state;
 
@@ -403,7 +580,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,
 	    nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
 	    nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
 	    nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
-	    nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags))
+	    nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) ||
+	    ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info))
 		return -ENOMEM;
 
 	return 0;
@@ -416,7 +594,9 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
 		+ nla_total_size(16)	/* LWTUNNEL_IP6_SRC */
 		+ nla_total_size(1)	/* LWTUNNEL_IP6_HOPLIMIT */
 		+ nla_total_size(1)	/* LWTUNNEL_IP6_TC */
-		+ nla_total_size(2);	/* LWTUNNEL_IP6_FLAGS */
+		+ nla_total_size(2)	/* LWTUNNEL_IP6_FLAGS */
+		+ ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
+					/* LWTUNNEL_IP6_OPTS */
 }
 
 static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
-- 
cgit v1.2.3-59-g8ed1b


From edf31cbb1502481da181a09148adb33e12599185 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 6 Nov 2019 17:01:06 +0800
Subject: lwtunnel: add options setting and dumping for vxlan

Based on the code framework built on the last patch, to
support setting and dumping for vxlan, we only need to
add ip_tun_parse_opts_vxlan() for .build_state and
ip_tun_fill_encap_opts_vxlan() for .fill_encap and
if (tun_flags & TUNNEL_VXLAN_OPT) for .get_encap_size.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/lwtunnel.h |  9 ++++++
 net/ipv4/ip_tunnel_core.c     | 67 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 74 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index b595ab219036..638b7b108453 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -51,6 +51,7 @@ enum lwtunnel_ip6_t {
 enum {
 	LWTUNNEL_IP_OPTS_UNSPEC,
 	LWTUNNEL_IP_OPTS_GENEVE,
+	LWTUNNEL_IP_OPTS_VXLAN,
 	__LWTUNNEL_IP_OPTS_MAX,
 };
 
@@ -66,6 +67,14 @@ enum {
 
 #define LWTUNNEL_IP_OPT_GENEVE_MAX (__LWTUNNEL_IP_OPT_GENEVE_MAX - 1)
 
+enum {
+	LWTUNNEL_IP_OPT_VXLAN_UNSPEC,
+	LWTUNNEL_IP_OPT_VXLAN_GBP,
+	__LWTUNNEL_IP_OPT_VXLAN_MAX,
+};
+
+#define LWTUNNEL_IP_OPT_VXLAN_MAX (__LWTUNNEL_IP_OPT_VXLAN_MAX - 1)
+
 enum {
 	LWT_BPF_PROG_UNSPEC,
 	LWT_BPF_PROG_FD,
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 1ec9d9419c34..61be2e0cbb19 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -35,6 +35,7 @@
 #include <net/rtnetlink.h>
 #include <net/dst_metadata.h>
 #include <net/geneve.h>
+#include <net/vxlan.h>
 
 const struct ip_tunnel_encap_ops __rcu *
 		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
@@ -224,6 +225,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
 
 static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = {
 	[LWTUNNEL_IP_OPTS_GENEVE]	= { .type = NLA_NESTED },
+	[LWTUNNEL_IP_OPTS_VXLAN]	= { .type = NLA_NESTED },
 };
 
 static const struct nla_policy
@@ -233,6 +235,11 @@ geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = {
 	[LWTUNNEL_IP_OPT_GENEVE_DATA]	= { .type = NLA_BINARY, .len = 128 },
 };
 
+static const struct nla_policy
+vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = {
+	[LWTUNNEL_IP_OPT_VXLAN_GBP]	= { .type = NLA_U32 },
+};
+
 static int ip_tun_parse_opts_geneve(struct nlattr *attr,
 				    struct ip_tunnel_info *info,
 				    struct netlink_ext_ack *extack)
@@ -270,6 +277,32 @@ static int ip_tun_parse_opts_geneve(struct nlattr *attr,
 	return sizeof(struct geneve_opt) + data_len;
 }
 
+static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
+				   struct ip_tunnel_info *info,
+				   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1];
+	int err;
+
+	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_OPT_VXLAN_MAX,
+					  attr, vxlan_opt_policy, extack);
+	if (err)
+		return err;
+
+	if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP])
+		return -EINVAL;
+
+	if (info) {
+		struct vxlan_metadata *md = ip_tunnel_info_opts(info);
+
+		attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP];
+		md->gbp = nla_get_u32(attr);
+		info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+	}
+
+	return sizeof(struct vxlan_metadata);
+}
+
 static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 			     struct netlink_ext_ack *extack)
 {
@@ -287,6 +320,9 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 	if (tb[LWTUNNEL_IP_OPTS_GENEVE])
 		err = ip_tun_parse_opts_geneve(tb[LWTUNNEL_IP_OPTS_GENEVE],
 					       info, extack);
+	else if (tb[LWTUNNEL_IP_OPTS_VXLAN])
+		err = ip_tun_parse_opts_vxlan(tb[LWTUNNEL_IP_OPTS_VXLAN],
+					      info, extack);
 	else
 		err = -EINVAL;
 
@@ -404,13 +440,34 @@ static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb,
 	return 0;
 }
 
+static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb,
+					struct ip_tunnel_info *tun_info)
+{
+	struct vxlan_metadata *md;
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN);
+	if (!nest)
+		return -ENOMEM;
+
+	md = ip_tunnel_info_opts(tun_info);
+	if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) {
+		nla_nest_cancel(skb, nest);
+		return -ENOMEM;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
 static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
 				  struct ip_tunnel_info *tun_info)
 {
 	struct nlattr *nest;
 	int err = 0;
 
-	if (!(tun_info->key.tun_flags & TUNNEL_GENEVE_OPT))
+	if (!(tun_info->key.tun_flags &
+	      (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT)))
 		return 0;
 
 	nest = nla_nest_start_noflag(skb, type);
@@ -419,6 +476,8 @@ static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
 
 	if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT)
 		err = ip_tun_fill_encap_opts_geneve(skb, tun_info);
+	else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT)
+		err = ip_tun_fill_encap_opts_vxlan(skb, tun_info);
 
 	if (err) {
 		nla_nest_cancel(skb, nest);
@@ -451,7 +510,8 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
 {
 	int opt_len;
 
-	if (!(info->key.tun_flags & TUNNEL_GENEVE_OPT))
+	if (!(info->key.tun_flags &
+	      (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT)))
 		return 0;
 
 	opt_len = nla_total_size(0);		/* LWTUNNEL_IP_OPTS */
@@ -463,6 +523,9 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
 			   + nla_total_size(1)	/* OPT_GENEVE_TYPE */
 			   + nla_total_size(opt->length * 4);
 						/* OPT_GENEVE_DATA */
+	} else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+		opt_len += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_VXLAN */
+			   + nla_total_size(4);	/* OPT_VXLAN_GBP */
 	}
 
 	return opt_len;
-- 
cgit v1.2.3-59-g8ed1b


From b0a21810bd5e1f92e3379899cc8ca9fe144ee8b3 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 6 Nov 2019 17:01:07 +0800
Subject: lwtunnel: add options setting and dumping for erspan

Based on the code framework built on the last patch, to
support setting and dumping for vxlan, we only need to
add ip_tun_parse_opts_erspan() for .build_state and
ip_tun_fill_encap_opts_erspan() for .fill_encap and
if (tun_flags & TUNNEL_ERSPAN_OPT) for .get_encap_size.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/lwtunnel.h | 12 ++++++
 net/ipv4/ip_tunnel_core.c     | 94 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 104 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 638b7b108453..f6035f737193 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -52,6 +52,7 @@ enum {
 	LWTUNNEL_IP_OPTS_UNSPEC,
 	LWTUNNEL_IP_OPTS_GENEVE,
 	LWTUNNEL_IP_OPTS_VXLAN,
+	LWTUNNEL_IP_OPTS_ERSPAN,
 	__LWTUNNEL_IP_OPTS_MAX,
 };
 
@@ -75,6 +76,17 @@ enum {
 
 #define LWTUNNEL_IP_OPT_VXLAN_MAX (__LWTUNNEL_IP_OPT_VXLAN_MAX - 1)
 
+enum {
+	LWTUNNEL_IP_OPT_ERSPAN_UNSPEC,
+	LWTUNNEL_IP_OPT_ERSPAN_VER,
+	LWTUNNEL_IP_OPT_ERSPAN_INDEX,
+	LWTUNNEL_IP_OPT_ERSPAN_DIR,
+	LWTUNNEL_IP_OPT_ERSPAN_HWID,
+	__LWTUNNEL_IP_OPT_ERSPAN_MAX,
+};
+
+#define LWTUNNEL_IP_OPT_ERSPAN_MAX (__LWTUNNEL_IP_OPT_ERSPAN_MAX - 1)
+
 enum {
 	LWT_BPF_PROG_UNSPEC,
 	LWT_BPF_PROG_FD,
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 61be2e0cbb19..d4f84bf9289a 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -36,6 +36,7 @@
 #include <net/dst_metadata.h>
 #include <net/geneve.h>
 #include <net/vxlan.h>
+#include <net/erspan.h>
 
 const struct ip_tunnel_encap_ops __rcu *
 		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
@@ -226,6 +227,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
 static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = {
 	[LWTUNNEL_IP_OPTS_GENEVE]	= { .type = NLA_NESTED },
 	[LWTUNNEL_IP_OPTS_VXLAN]	= { .type = NLA_NESTED },
+	[LWTUNNEL_IP_OPTS_ERSPAN]	= { .type = NLA_NESTED },
 };
 
 static const struct nla_policy
@@ -240,6 +242,14 @@ vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = {
 	[LWTUNNEL_IP_OPT_VXLAN_GBP]	= { .type = NLA_U32 },
 };
 
+static const struct nla_policy
+erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = {
+	[LWTUNNEL_IP_OPT_ERSPAN_VER]	= { .type = NLA_U8 },
+	[LWTUNNEL_IP_OPT_ERSPAN_INDEX]	= { .type = NLA_U32 },
+	[LWTUNNEL_IP_OPT_ERSPAN_DIR]	= { .type = NLA_U8 },
+	[LWTUNNEL_IP_OPT_ERSPAN_HWID]	= { .type = NLA_U8 },
+};
+
 static int ip_tun_parse_opts_geneve(struct nlattr *attr,
 				    struct ip_tunnel_info *info,
 				    struct netlink_ext_ack *extack)
@@ -303,6 +313,46 @@ static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
 	return sizeof(struct vxlan_metadata);
 }
 
+static int ip_tun_parse_opts_erspan(struct nlattr *attr,
+				    struct ip_tunnel_info *info,
+				    struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1];
+	int err;
+
+	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX,
+					  attr, erspan_opt_policy, extack);
+	if (err)
+		return err;
+
+	if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER])
+		return -EINVAL;
+
+	if (info) {
+		struct erspan_metadata *md = ip_tunnel_info_opts(info);
+
+		attr = tb[LWTUNNEL_IP_OPT_ERSPAN_VER];
+		md->version = nla_get_u8(attr);
+
+		if (md->version == 1 && tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]) {
+			attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX];
+			md->u.index = nla_get_be32(attr);
+		} else if (md->version == 2 && tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] &&
+			   tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]) {
+			attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR];
+			md->u.md2.dir = nla_get_u8(attr);
+			attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID];
+			set_hwid(&md->u.md2, nla_get_u8(attr));
+		} else {
+			return -EINVAL;
+		}
+
+		info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+	}
+
+	return sizeof(struct erspan_metadata);
+}
+
 static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 			     struct netlink_ext_ack *extack)
 {
@@ -323,6 +373,9 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 	else if (tb[LWTUNNEL_IP_OPTS_VXLAN])
 		err = ip_tun_parse_opts_vxlan(tb[LWTUNNEL_IP_OPTS_VXLAN],
 					      info, extack);
+	else if (tb[LWTUNNEL_IP_OPTS_ERSPAN])
+		err = ip_tun_parse_opts_erspan(tb[LWTUNNEL_IP_OPTS_ERSPAN],
+					       info, extack);
 	else
 		err = -EINVAL;
 
@@ -460,6 +513,37 @@ static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb,
 	return 0;
 }
 
+static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb,
+					 struct ip_tunnel_info *tun_info)
+{
+	struct erspan_metadata *md;
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN);
+	if (!nest)
+		return -ENOMEM;
+
+	md = ip_tunnel_info_opts(tun_info);
+	if (nla_put_u32(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version))
+		goto err;
+
+	if (md->version == 1 &&
+	    nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index))
+		goto err;
+
+	if (md->version == 2 &&
+	    (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) ||
+	     nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID,
+			get_hwid(&md->u.md2))))
+		goto err;
+
+	nla_nest_end(skb, nest);
+	return 0;
+err:
+	nla_nest_cancel(skb, nest);
+	return -ENOMEM;
+}
+
 static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
 				  struct ip_tunnel_info *tun_info)
 {
@@ -467,7 +551,7 @@ static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
 	int err = 0;
 
 	if (!(tun_info->key.tun_flags &
-	      (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT)))
+	      (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT)))
 		return 0;
 
 	nest = nla_nest_start_noflag(skb, type);
@@ -478,6 +562,8 @@ static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
 		err = ip_tun_fill_encap_opts_geneve(skb, tun_info);
 	else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT)
 		err = ip_tun_fill_encap_opts_vxlan(skb, tun_info);
+	else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)
+		err = ip_tun_fill_encap_opts_erspan(skb, tun_info);
 
 	if (err) {
 		nla_nest_cancel(skb, nest);
@@ -511,7 +597,7 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
 	int opt_len;
 
 	if (!(info->key.tun_flags &
-	      (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT)))
+	      (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT)))
 		return 0;
 
 	opt_len = nla_total_size(0);		/* LWTUNNEL_IP_OPTS */
@@ -526,6 +612,10 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
 	} else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
 		opt_len += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_VXLAN */
 			   + nla_total_size(4);	/* OPT_VXLAN_GBP */
+	} else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) {
+		opt_len += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_ERSPAN */
+			   + nla_total_size(1)	/* OPT_ERSPAN_VER */
+			   + nla_total_size(4);	/* OPT_ERSPAN_INDEX/DIR/HWID */
 	}
 
 	return opt_len;
-- 
cgit v1.2.3-59-g8ed1b


From 9ed498c6280a2f2b51d02df96df53037272ede49 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 6 Nov 2019 10:04:11 -0800
Subject: net: silence data-races on sk_backlog.tail

sk->sk_backlog.tail might be read without holding the socket spinlock,
we need to add proper READ_ONCE()/WRITE_ONCE() to silence the warnings.

KCSAN reported :

BUG: KCSAN: data-race in tcp_add_backlog / tcp_recvmsg

write to 0xffff8881265109f8 of 8 bytes by interrupt on cpu 1:
 __sk_add_backlog include/net/sock.h:907 [inline]
 sk_add_backlog include/net/sock.h:938 [inline]
 tcp_add_backlog+0x476/0xce0 net/ipv4/tcp_ipv4.c:1759
 tcp_v4_rcv+0x1a70/0x1bd0 net/ipv4/tcp_ipv4.c:1947
 ip_protocol_deliver_rcu+0x4d/0x420 net/ipv4/ip_input.c:204
 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:442 [inline]
 ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:4929
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5043
 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5133
 napi_skb_finish net/core/dev.c:5596 [inline]
 napi_gro_receive+0x28f/0x330 net/core/dev.c:5629
 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061
 virtnet_receive drivers/net/virtio_net.c:1323 [inline]
 virtnet_poll+0x436/0x7d0 drivers/net/virtio_net.c:1428
 napi_poll net/core/dev.c:6311 [inline]
 net_rx_action+0x3ae/0xa90 net/core/dev.c:6379
 __do_softirq+0x115/0x33f kernel/softirq.c:292
 invoke_softirq kernel/softirq.c:373 [inline]
 irq_exit+0xbb/0xe0 kernel/softirq.c:413
 exiting_irq arch/x86/include/asm/apic.h:536 [inline]
 do_IRQ+0xa6/0x180 arch/x86/kernel/irq.c:263
 ret_from_intr+0x0/0x19
 native_safe_halt+0xe/0x10 arch/x86/kernel/paravirt.c:71
 arch_cpu_idle+0x1f/0x30 arch/x86/kernel/process.c:571
 default_idle_call+0x1e/0x40 kernel/sched/idle.c:94
 cpuidle_idle_call kernel/sched/idle.c:154 [inline]
 do_idle+0x1af/0x280 kernel/sched/idle.c:263
 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355
 start_secondary+0x208/0x260 arch/x86/kernel/smpboot.c:264
 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241

read to 0xffff8881265109f8 of 8 bytes by task 8057 on cpu 0:
 tcp_recvmsg+0x46e/0x1b40 net/ipv4/tcp.c:2050
 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838
 sock_recvmsg_nosec net/socket.c:871 [inline]
 sock_recvmsg net/socket.c:889 [inline]
 sock_recvmsg+0x92/0xb0 net/socket.c:885
 sock_read_iter+0x15f/0x1e0 net/socket.c:967
 call_read_iter include/linux/fs.h:1889 [inline]
 new_sync_read+0x389/0x4f0 fs/read_write.c:414
 __vfs_read+0xb1/0xc0 fs/read_write.c:427
 vfs_read fs/read_write.c:461 [inline]
 vfs_read+0x143/0x2c0 fs/read_write.c:446
 ksys_read+0xd5/0x1b0 fs/read_write.c:587
 __do_sys_read fs/read_write.c:597 [inline]
 __se_sys_read fs/read_write.c:595 [inline]
 __x64_sys_read+0x4c/0x60 fs/read_write.c:595
 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 8057 Comm: syz-fuzzer Not tainted 5.4.0-rc6+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/crypto/chelsio/chtls/chtls_io.c | 10 +++++-----
 include/net/sock.h                      |  4 ++--
 net/ipv4/tcp.c                          |  2 +-
 net/llc/af_llc.c                        |  2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
index 98bc5a4cd5e7..599dec59c6cc 100644
--- a/drivers/crypto/chelsio/chtls/chtls_io.c
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -1437,7 +1437,7 @@ static int chtls_pt_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 				      csk->wr_max_credits))
 			sk->sk_write_space(sk);
 
-		if (copied >= target && !sk->sk_backlog.tail)
+		if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
 			break;
 
 		if (copied) {
@@ -1470,7 +1470,7 @@ static int chtls_pt_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 				break;
 			}
 		}
-		if (sk->sk_backlog.tail) {
+		if (READ_ONCE(sk->sk_backlog.tail)) {
 			release_sock(sk);
 			lock_sock(sk);
 			chtls_cleanup_rbuf(sk, copied);
@@ -1615,7 +1615,7 @@ static int peekmsg(struct sock *sk, struct msghdr *msg,
 			break;
 		}
 
-		if (sk->sk_backlog.tail) {
+		if (READ_ONCE(sk->sk_backlog.tail)) {
 			/* Do not sleep, just process backlog. */
 			release_sock(sk);
 			lock_sock(sk);
@@ -1743,7 +1743,7 @@ int chtls_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 				      csk->wr_max_credits))
 			sk->sk_write_space(sk);
 
-		if (copied >= target && !sk->sk_backlog.tail)
+		if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
 			break;
 
 		if (copied) {
@@ -1774,7 +1774,7 @@ int chtls_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			}
 		}
 
-		if (sk->sk_backlog.tail) {
+		if (READ_ONCE(sk->sk_backlog.tail)) {
 			release_sock(sk);
 			lock_sock(sk);
 			chtls_cleanup_rbuf(sk, copied);
diff --git a/include/net/sock.h b/include/net/sock.h
index d4d3ef5ba049..bd210c78dc9d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -899,11 +899,11 @@ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 	skb_dst_force(skb);
 
 	if (!sk->sk_backlog.tail)
-		sk->sk_backlog.head = skb;
+		WRITE_ONCE(sk->sk_backlog.head, skb);
 	else
 		sk->sk_backlog.tail->next = skb;
 
-	sk->sk_backlog.tail = skb;
+	WRITE_ONCE(sk->sk_backlog.tail, skb);
 	skb->next = NULL;
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fb1666440e10..8fb4fefcfd54 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2047,7 +2047,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 
 		/* Well, if we have backlog, try to process it now yet. */
 
-		if (copied >= target && !sk->sk_backlog.tail)
+		if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
 			break;
 
 		if (copied) {
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 50d2c9749db3..2922d4150d88 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -780,7 +780,7 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		}
 		/* Well, if we have backlog, try to process it now yet. */
 
-		if (copied >= target && !sk->sk_backlog.tail)
+		if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
 			break;
 
 		if (copied) {
-- 
cgit v1.2.3-59-g8ed1b


From 2c63221cd9e5c0dad0424029aeb1c40faada8330 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Wed, 6 Nov 2019 23:36:13 +0100
Subject: dt-bindings: net: phy: Add support for AT803X

Document the Atheros AR803x PHY bindings.

Signed-off-by: Michael Walle <michael@walle.cc>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../devicetree/bindings/net/qca,ar803x.yaml        | 111 +++++++++++++++++++++
 MAINTAINERS                                        |   2 +
 include/dt-bindings/net/qca-ar803x.h               |  13 +++
 3 files changed, 126 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/qca,ar803x.yaml
 create mode 100644 include/dt-bindings/net/qca-ar803x.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/net/qca,ar803x.yaml b/Documentation/devicetree/bindings/net/qca,ar803x.yaml
new file mode 100644
index 000000000000..5a6c9d20c0ba
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/qca,ar803x.yaml
@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: GPL-2.0+
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/qca,ar803x.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Atheros AR803x PHY
+
+maintainers:
+  - Andrew Lunn <andrew@lunn.ch>
+  - Florian Fainelli <f.fainelli@gmail.com>
+  - Heiner Kallweit <hkallweit1@gmail.com>
+
+description: |
+  Bindings for Qualcomm Atheros AR803x PHYs
+
+allOf:
+  - $ref: ethernet-phy.yaml#
+
+properties:
+  qca,clk-out-frequency:
+    description: Clock output frequency in Hertz.
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32
+      - enum: [ 25000000, 50000000, 62500000, 125000000 ]
+
+  qca,clk-out-strength:
+    description: Clock output driver strength.
+    allOf:
+      - $ref: /schemas/types.yaml#/definitions/uint32
+      - enum: [ 0, 1, 2 ]
+
+  qca,keep-pll-enabled:
+    description: |
+      If set, keep the PLL enabled even if there is no link. Useful if you
+      want to use the clock output without an ethernet link.
+
+      Only supported on the AR8031.
+    type: boolean
+
+  vddio-supply:
+    description: |
+      RGMII I/O voltage regulator (see regulator/regulator.yaml).
+
+      The PHY supports RGMII I/O voltages of 1.5V, 1.8V and 2.5V. You can
+      either connect this to the vddio-regulator (1.5V / 1.8V) or the
+      vddh-regulator (2.5V).
+
+      Only supported on the AR8031.
+
+  vddio-regulator:
+    type: object
+    description:
+      Initial data for the VDDIO regulator. Set this to 1.5V or 1.8V.
+    allOf:
+      - $ref: /schemas/regulator/regulator.yaml
+
+  vddh-regulator:
+    type: object
+    description:
+      Dummy subnode to model the external connection of the PHY VDDH
+      regulator to VDDIO.
+    allOf:
+      - $ref: /schemas/regulator/regulator.yaml
+
+
+examples:
+  - |
+    #include <dt-bindings/net/qca-ar803x.h>
+
+    ethernet {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        phy-mode = "rgmii-id";
+
+        ethernet-phy@0 {
+            reg = <0>;
+
+            qca,clk-out-frequency = <125000000>;
+            qca,clk-out-strength = <AR803X_STRENGTH_FULL>;
+
+            vddio-supply = <&vddio>;
+
+            vddio: vddio-regulator {
+                regulator-min-microvolt = <1800000>;
+                regulator-max-microvolt = <1800000>;
+            };
+        };
+    };
+  - |
+    #include <dt-bindings/net/qca-ar803x.h>
+
+    ethernet {
+        #address-cells = <1>;
+        #size-cells = <0>;
+
+        phy-mode = "rgmii-id";
+
+        ethernet-phy@0 {
+            reg = <0>;
+
+            qca,clk-out-frequency = <50000000>;
+            qca,keep-pll-enabled;
+
+            vddio-supply = <&vddh>;
+
+            vddh: vddh-regulator {
+            };
+        };
+    };
diff --git a/MAINTAINERS b/MAINTAINERS
index c0024b296158..709c60aacb58 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6155,10 +6155,12 @@ S:	Maintained
 F:	Documentation/ABI/testing/sysfs-class-net-phydev
 F:	Documentation/devicetree/bindings/net/ethernet-phy.yaml
 F:	Documentation/devicetree/bindings/net/mdio*
+F:	Documentation/devicetree/bindings/net/qca,ar803x.yaml
 F:	Documentation/networking/phy.rst
 F:	drivers/net/phy/
 F:	drivers/of/of_mdio.c
 F:	drivers/of/of_net.c
+F:	include/dt-bindings/net/qca-ar803x.h
 F:	include/linux/*mdio*.h
 F:	include/linux/of_net.h
 F:	include/linux/phy.h
diff --git a/include/dt-bindings/net/qca-ar803x.h b/include/dt-bindings/net/qca-ar803x.h
new file mode 100644
index 000000000000..9c046c7242ed
--- /dev/null
+++ b/include/dt-bindings/net/qca-ar803x.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Device Tree constants for the Qualcomm Atheros AR803x PHYs
+ */
+
+#ifndef _DT_BINDINGS_QCA_AR803X_H
+#define _DT_BINDINGS_QCA_AR803X_H
+
+#define AR803X_STRENGTH_FULL		0
+#define AR803X_STRENGTH_HALF		1
+#define AR803X_STRENGTH_QUARTER		2
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 200ecef67b8d09d16ec55f91c92751dcc7a38d40 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 7 Nov 2019 11:51:18 -0800
Subject: tcp: Remove one extra ktime_get_ns() from cookie_init_timestamp

tcp_make_synack() already uses tcp_clock_ns(), and can pass
the value to cookie_init_timestamp() to avoid another call
to ktime_get_ns() helper.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 12 +++++++++---
 net/ipv4/syncookies.c |  4 ++--
 net/ipv4/tcp_output.c |  2 +-
 3 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index ab4eb5eb5d07..36f195fb576a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -537,7 +537,7 @@ static inline u32 tcp_cookie_time(void)
 u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
 			      u16 *mssp);
 __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
-u64 cookie_init_timestamp(struct request_sock *req);
+u64 cookie_init_timestamp(struct request_sock *req, u64 now);
 bool cookie_timestamp_decode(const struct net *net,
 			     struct tcp_options_received *opt);
 bool cookie_ecn_ok(const struct tcp_options_received *opt,
@@ -757,10 +757,16 @@ static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
 	return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
 }
 
+/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
+static inline u32 tcp_ns_to_ts(u64 ns)
+{
+	return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
+}
+
 /* Could use tcp_clock_us() / 1000, but this version uses a single divide */
 static inline u32 tcp_time_stamp_raw(void)
 {
-	return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ);
+	return tcp_ns_to_ts(tcp_clock_ns());
 }
 
 void tcp_mstamp_refresh(struct tcp_sock *tp);
@@ -772,7 +778,7 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
 
 static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
 {
-	return div_u64(skb->skb_mstamp_ns, NSEC_PER_SEC / TCP_TS_HZ);
+	return tcp_ns_to_ts(skb->skb_mstamp_ns);
 }
 
 /* provide the departure time in us unit */
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 535b69326f66..345b2b0ff618 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -62,10 +62,10 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
  * Since subsequent timestamps use the normal tcp_time_stamp value, we
  * must make sure that the resulting initial timestamp is <= tcp_time_stamp.
  */
-u64 cookie_init_timestamp(struct request_sock *req)
+u64 cookie_init_timestamp(struct request_sock *req, u64 now)
 {
 	struct inet_request_sock *ireq;
-	u32 ts, ts_now = tcp_time_stamp_raw();
+	u32 ts, ts_now = tcp_ns_to_ts(now);
 	u32 options = 0;
 
 	ireq = inet_rsk(req);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0488607c5cd3..be6d22b8190f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3290,7 +3290,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	now = tcp_clock_ns();
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
-		skb->skb_mstamp_ns = cookie_init_timestamp(req);
+		skb->skb_mstamp_ns = cookie_init_timestamp(req, now);
 	else
 #endif
 	{
-- 
cgit v1.2.3-59-g8ed1b


From 6896cc4d8fe6fe6163d6f0baa02a270da68896e8 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amitc@mellanox.com>
Date: Thu, 7 Nov 2019 18:42:09 +0200
Subject: devlink: Add layer 3 generic packet traps

Add packet traps that can report packets that were dropped during layer
3 forwarding.

Signed-off-by: Amit Cohen <amitc@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/devlink-trap.rst | 41 +++++++++++++++++++++++++++++++
 include/net/devlink.h                     | 27 ++++++++++++++++++++
 net/core/devlink.c                        |  9 +++++++
 3 files changed, 77 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/devlink-trap.rst b/Documentation/networking/devlink-trap.rst
index 8e90a85f3bd5..dc3dc87217c9 100644
--- a/Documentation/networking/devlink-trap.rst
+++ b/Documentation/networking/devlink-trap.rst
@@ -162,6 +162,47 @@ be added to the following table:
      - ``drop``
      - Traps packets that the device decided to drop because they could not be
        enqueued to a transmission queue which is full
+   * - ``non_ip``
+     - ``drop``
+     - Traps packets that the device decided to drop because they need to
+       undergo a layer 3 lookup, but are not IP or MPLS packets
+   * - ``uc_dip_over_mc_dmac``
+     - ``drop``
+     - Traps packets that the device decided to drop because they need to be
+       routed and they have a unicast destination IP and a multicast destination
+       MAC
+   * - ``dip_is_loopback_address``
+     - ``drop``
+     - Traps packets that the device decided to drop because they need to be
+       routed and their destination IP is the loopback address (i.e., 127.0.0.0/8
+       and ::1/128)
+   * - ``sip_is_mc``
+     - ``drop``
+     - Traps packets that the device decided to drop because they need to be
+       routed and their source IP is multicast (i.e., 224.0.0.0/8 and ff::/8)
+   * - ``sip_is_loopback_address``
+     - ``drop``
+     - Traps packets that the device decided to drop because they need to be
+       routed and their source IP is the loopback address (i.e., 127.0.0.0/8 and ::1/128)
+   * - ``ip_header_corrupted``
+     - ``drop``
+     - Traps packets that the device decided to drop because they need to be
+       routed and their IP header is corrupted: wrong checksum, wrong IP version
+       or too short Internet Header Length (IHL)
+   * - ``ipv4_sip_is_limited_bc``
+     - ``drop``
+     - Traps packets that the device decided to drop because they need to be
+       routed and their source IP is limited broadcast (i.e., 255.255.255.255/32)
+   * - ``ipv6_mc_dip_reserved_scope``
+     - ``drop``
+     - Traps IPv6 packets that the device decided to drop because they need to
+       be routed and their IPv6 multicast destination IP has a reserved scope
+       (i.e., ffx0::/16)
+   * - ``ipv6_mc_dip_interface_local_scope``
+     - ``drop``
+     - Traps IPv6 packets that the device decided to drop because they need to
+       be routed and their IPv6 multicast destination IP has an interface-local scope
+       (i.e., ffx1::/16)
 
 Driver-specific Packet Traps
 ============================
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 6bf3b9e0595a..df7814d55bf9 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -569,6 +569,15 @@ enum devlink_trap_generic_id {
 	DEVLINK_TRAP_GENERIC_ID_BLACKHOLE_ROUTE,
 	DEVLINK_TRAP_GENERIC_ID_TTL_ERROR,
 	DEVLINK_TRAP_GENERIC_ID_TAIL_DROP,
+	DEVLINK_TRAP_GENERIC_ID_NON_IP_PACKET,
+	DEVLINK_TRAP_GENERIC_ID_UC_DIP_MC_DMAC,
+	DEVLINK_TRAP_GENERIC_ID_DIP_LB,
+	DEVLINK_TRAP_GENERIC_ID_SIP_MC,
+	DEVLINK_TRAP_GENERIC_ID_SIP_LB,
+	DEVLINK_TRAP_GENERIC_ID_CORRUPTED_IP_HDR,
+	DEVLINK_TRAP_GENERIC_ID_IPV4_SIP_BC,
+	DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_RESERVED_SCOPE,
+	DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE,
 
 	/* Add new generic trap IDs above */
 	__DEVLINK_TRAP_GENERIC_ID_MAX,
@@ -607,6 +616,24 @@ enum devlink_trap_group_generic_id {
 	"ttl_value_is_too_small"
 #define DEVLINK_TRAP_GENERIC_NAME_TAIL_DROP \
 	"tail_drop"
+#define DEVLINK_TRAP_GENERIC_NAME_NON_IP_PACKET \
+	"non_ip"
+#define DEVLINK_TRAP_GENERIC_NAME_UC_DIP_MC_DMAC \
+	"uc_dip_over_mc_dmac"
+#define DEVLINK_TRAP_GENERIC_NAME_DIP_LB \
+	"dip_is_loopback_address"
+#define DEVLINK_TRAP_GENERIC_NAME_SIP_MC \
+	"sip_is_mc"
+#define DEVLINK_TRAP_GENERIC_NAME_SIP_LB \
+	"sip_is_loopback_address"
+#define DEVLINK_TRAP_GENERIC_NAME_CORRUPTED_IP_HDR \
+	"ip_header_corrupted"
+#define DEVLINK_TRAP_GENERIC_NAME_IPV4_SIP_BC \
+	"ipv4_sip_is_limited_bc"
+#define DEVLINK_TRAP_GENERIC_NAME_IPV6_MC_DIP_RESERVED_SCOPE \
+	"ipv6_mc_dip_reserved_scope"
+#define DEVLINK_TRAP_GENERIC_NAME_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE \
+	"ipv6_mc_dip_interface_local_scope"
 
 #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \
 	"l2_drops"
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 97e9a2246929..9bbe2162f22f 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -7602,6 +7602,15 @@ static const struct devlink_trap devlink_trap_generic[] = {
 	DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP),
 	DEVLINK_TRAP(TTL_ERROR, EXCEPTION),
 	DEVLINK_TRAP(TAIL_DROP, DROP),
+	DEVLINK_TRAP(NON_IP_PACKET, DROP),
+	DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP),
+	DEVLINK_TRAP(DIP_LB, DROP),
+	DEVLINK_TRAP(SIP_MC, DROP),
+	DEVLINK_TRAP(SIP_LB, DROP),
+	DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP),
+	DEVLINK_TRAP(IPV4_SIP_BC, DROP),
+	DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP),
+	DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP),
 };
 
 #define DEVLINK_TRAP_GROUP(_id)						      \
-- 
cgit v1.2.3-59-g8ed1b


From 3b063ae57bdfec5e574ace440e6c3f34c4115a92 Mon Sep 17 00:00:00 2001
From: Amit Cohen <amitc@mellanox.com>
Date: Thu, 7 Nov 2019 18:42:14 +0200
Subject: devlink: Add layer 3 generic packet exception traps

Add layer 3 generic packet exception traps that can report trapped
packets and documentation of the traps.

Unlike drop traps, these exception traps also need to inject the packet
to the kernel's receive path. For example, a packet that was trapped due
to unreachable neighbour need to be injected into the kernel so that it
will trigger an ARP request or a neighbour solicitation message.

Signed-off-by: Amit Cohen <amitc@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/devlink-trap.rst | 20 ++++++++++++++++++++
 include/net/devlink.h                     | 18 ++++++++++++++++++
 net/core/devlink.c                        |  6 ++++++
 3 files changed, 44 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/devlink-trap.rst b/Documentation/networking/devlink-trap.rst
index dc3dc87217c9..dc9659ca06fa 100644
--- a/Documentation/networking/devlink-trap.rst
+++ b/Documentation/networking/devlink-trap.rst
@@ -203,6 +203,26 @@ be added to the following table:
      - Traps IPv6 packets that the device decided to drop because they need to
        be routed and their IPv6 multicast destination IP has an interface-local scope
        (i.e., ffx1::/16)
+   * - ``mtu_value_is_too_small``
+     - ``exception``
+     - Traps packets that should have been routed by the device, but were bigger
+       than the MTU of the egress interface
+   * - ``unresolved_neigh``
+     - ``exception``
+     - Traps packets that did not have a matching IP neighbour after routing
+   * - ``mc_reverse_path_forwarding``
+     - ``exception``
+     - Traps multicast IP packets that failed reverse-path forwarding (RPF)
+       check during multicast routing
+   * - ``reject_route``
+     - ``exception``
+     - Traps packets that hit reject routes (i.e., "unreachable", "prohibit")
+   * - ``ipv4_lpm_miss``
+     - ``exception``
+     - Traps unicast IPv4 packets that did not match any route
+   * - ``ipv6_lpm_miss``
+     - ``exception``
+     - Traps unicast IPv6 packets that did not match any route
 
 Driver-specific Packet Traps
 ============================
diff --git a/include/net/devlink.h b/include/net/devlink.h
index df7814d55bf9..8d6b5846822c 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -578,6 +578,12 @@ enum devlink_trap_generic_id {
 	DEVLINK_TRAP_GENERIC_ID_IPV4_SIP_BC,
 	DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_RESERVED_SCOPE,
 	DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE,
+	DEVLINK_TRAP_GENERIC_ID_MTU_ERROR,
+	DEVLINK_TRAP_GENERIC_ID_UNRESOLVED_NEIGH,
+	DEVLINK_TRAP_GENERIC_ID_RPF,
+	DEVLINK_TRAP_GENERIC_ID_REJECT_ROUTE,
+	DEVLINK_TRAP_GENERIC_ID_IPV4_LPM_UNICAST_MISS,
+	DEVLINK_TRAP_GENERIC_ID_IPV6_LPM_UNICAST_MISS,
 
 	/* Add new generic trap IDs above */
 	__DEVLINK_TRAP_GENERIC_ID_MAX,
@@ -634,6 +640,18 @@ enum devlink_trap_group_generic_id {
 	"ipv6_mc_dip_reserved_scope"
 #define DEVLINK_TRAP_GENERIC_NAME_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE \
 	"ipv6_mc_dip_interface_local_scope"
+#define DEVLINK_TRAP_GENERIC_NAME_MTU_ERROR \
+	"mtu_value_is_too_small"
+#define DEVLINK_TRAP_GENERIC_NAME_UNRESOLVED_NEIGH \
+	"unresolved_neigh"
+#define DEVLINK_TRAP_GENERIC_NAME_RPF \
+	"mc_reverse_path_forwarding"
+#define DEVLINK_TRAP_GENERIC_NAME_REJECT_ROUTE \
+	"reject_route"
+#define DEVLINK_TRAP_GENERIC_NAME_IPV4_LPM_UNICAST_MISS \
+	"ipv4_lpm_miss"
+#define DEVLINK_TRAP_GENERIC_NAME_IPV6_LPM_UNICAST_MISS \
+	"ipv6_lpm_miss"
 
 #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \
 	"l2_drops"
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 9bbe2162f22f..ff53f7d29dea 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -7611,6 +7611,12 @@ static const struct devlink_trap devlink_trap_generic[] = {
 	DEVLINK_TRAP(IPV4_SIP_BC, DROP),
 	DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP),
 	DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP),
+	DEVLINK_TRAP(MTU_ERROR, EXCEPTION),
+	DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION),
+	DEVLINK_TRAP(RPF, EXCEPTION),
+	DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION),
+	DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION),
+	DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION),
 };
 
 #define DEVLINK_TRAP_GROUP(_id)						      \
-- 
cgit v1.2.3-59-g8ed1b


From de7d5084d82794a8e83afb994fcb07f82da3cd7b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 7 Nov 2019 16:27:14 -0800
Subject: net: provide dev_lstats_read() helper

Many network drivers use hand-coded implementation of the same thing,
let's factorize things so that u64_stats_t adoption is done once.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/loopback.c    | 24 +++++++++++++++++-------
 include/linux/netdevice.h |  2 ++
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 14545a8797a8..92336ac4c5e6 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -99,13 +99,13 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 }
 
-static void loopback_get_stats64(struct net_device *dev,
-				 struct rtnl_link_stats64 *stats)
+void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes)
 {
-	u64 bytes = 0;
-	u64 packets = 0;
 	int i;
 
+	*packets = 0;
+	*bytes = 0;
+
 	for_each_possible_cpu(i) {
 		const struct pcpu_lstats *lb_stats;
 		u64 tbytes, tpackets;
@@ -114,12 +114,22 @@ static void loopback_get_stats64(struct net_device *dev,
 		lb_stats = per_cpu_ptr(dev->lstats, i);
 		do {
 			start = u64_stats_fetch_begin_irq(&lb_stats->syncp);
-			tbytes = lb_stats->bytes;
 			tpackets = lb_stats->packets;
+			tbytes = lb_stats->bytes;
 		} while (u64_stats_fetch_retry_irq(&lb_stats->syncp, start));
-		bytes   += tbytes;
-		packets += tpackets;
+		*bytes   += tbytes;
+		*packets += tpackets;
 	}
+}
+EXPORT_SYMBOL(dev_lstats_read);
+
+static void loopback_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *stats)
+{
+	u64 packets, bytes;
+
+	dev_lstats_read(dev, &packets, &bytes);
+
 	stats->rx_packets = packets;
 	stats->tx_packets = packets;
 	stats->rx_bytes   = bytes;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1f140a6b66df..75561992c31f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2401,6 +2401,8 @@ struct pcpu_lstats {
 	struct u64_stats_sync syncp;
 } __aligned(2 * sizeof(u64));
 
+void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes);
+
 #define __netdev_alloc_pcpu_stats(type, gfp)				\
 ({									\
 	typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\
-- 
cgit v1.2.3-59-g8ed1b


From dd5382a08157756510aa8d7269c662eccde775cb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 7 Nov 2019 16:27:15 -0800
Subject: net: provide dev_lstats_add() helper

Many network drivers need it and hand-coded the same function.

In order to ease u64_stats_t adoption, it is time to factorize.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/loopback.c    | 12 ++----------
 include/linux/netdevice.h | 10 ++++++++++
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 92336ac4c5e6..47ad2478b9f3 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -68,7 +68,6 @@ EXPORT_SYMBOL(blackhole_netdev);
 static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 				 struct net_device *dev)
 {
-	struct pcpu_lstats *lb_stats;
 	int len;
 
 	skb_tx_timestamp(skb);
@@ -85,16 +84,9 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 
 	skb->protocol = eth_type_trans(skb, dev);
 
-	/* it's OK to use per_cpu_ptr() because BHs are off */
-	lb_stats = this_cpu_ptr(dev->lstats);
-
 	len = skb->len;
-	if (likely(netif_rx(skb) == NET_RX_SUCCESS)) {
-		u64_stats_update_begin(&lb_stats->syncp);
-		lb_stats->bytes += len;
-		lb_stats->packets++;
-		u64_stats_update_end(&lb_stats->syncp);
-	}
+	if (likely(netif_rx(skb) == NET_RX_SUCCESS))
+		dev_lstats_add(dev, len);
 
 	return NETDEV_TX_OK;
 }
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 75561992c31f..461a36220cf4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2403,6 +2403,16 @@ struct pcpu_lstats {
 
 void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes);
 
+static inline void dev_lstats_add(struct net_device *dev, unsigned int len)
+{
+	struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats);
+
+	u64_stats_update_begin(&lstats->syncp);
+	lstats->bytes += len;
+	lstats->packets++;
+	u64_stats_update_end(&lstats->syncp);
+}
+
 #define __netdev_alloc_pcpu_stats(type, gfp)				\
 ({									\
 	typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\
-- 
cgit v1.2.3-59-g8ed1b


From 316580b69d0a7aeeee5063af47438b626bc47cbd Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 7 Nov 2019 16:27:20 -0800
Subject: u64_stats: provide u64_stats_t type

On 64bit arches, struct u64_stats_sync is empty and provides
no help against load/store tearing.

Using READ_ONCE()/WRITE_ONCE() would be needed.

But the update side would be slightly more expensive.

local64_t was defined so that we could use regular adds
in a manner which is atomic wrt IRQs.

However the u64_stats infra means we do not have to use
local64_t on 32bit arches since the syncp provides the needed
protection.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/u64_stats_sync.h | 51 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 47 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index a27604f99ed0..9de5c10293f5 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -40,8 +40,8 @@
  *   spin_lock_bh(...) or other synchronization to get exclusive access
  *   ...
  *   u64_stats_update_begin(&stats->syncp);
- *   stats->bytes64 += len; // non atomic operation
- *   stats->packets64++;    // non atomic operation
+ *   u64_stats_add(&stats->bytes64, len); // non atomic operation
+ *   u64_stats_inc(&stats->packets64);    // non atomic operation
  *   u64_stats_update_end(&stats->syncp);
  *
  * While a consumer (reader) should use following template to get consistent
@@ -52,8 +52,8 @@
  *
  * do {
  *         start = u64_stats_fetch_begin(&stats->syncp);
- *         tbytes = stats->bytes64; // non atomic operation
- *         tpackets = stats->packets64; // non atomic operation
+ *         tbytes = u64_stats_read(&stats->bytes64); // non atomic operation
+ *         tpackets = u64_stats_read(&stats->packets64); // non atomic operation
  * } while (u64_stats_fetch_retry(&stats->syncp, start));
  *
  *
@@ -68,6 +68,49 @@ struct u64_stats_sync {
 #endif
 };
 
+#if BITS_PER_LONG == 64
+#include <asm/local64.h>
+
+typedef struct {
+	local64_t	v;
+} u64_stats_t ;
+
+static inline u64 u64_stats_read(const u64_stats_t *p)
+{
+	return local64_read(&p->v);
+}
+
+static inline void u64_stats_add(u64_stats_t *p, unsigned long val)
+{
+	local64_add(val, &p->v);
+}
+
+static inline void u64_stats_inc(u64_stats_t *p)
+{
+	local64_inc(&p->v);
+}
+
+#else
+
+typedef struct {
+	u64		v;
+} u64_stats_t;
+
+static inline u64 u64_stats_read(const u64_stats_t *p)
+{
+	return p->v;
+}
+
+static inline void u64_stats_add(u64_stats_t *p, unsigned long val)
+{
+	p->v += val;
+}
+
+static inline void u64_stats_inc(u64_stats_t *p)
+{
+	p->v++;
+}
+#endif
 
 static inline void u64_stats_init(struct u64_stats_sync *syncp)
 {
-- 
cgit v1.2.3-59-g8ed1b


From fd2f4737870eb866537fbbffa2b59414b9b0c0a2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 7 Nov 2019 16:27:22 -0800
Subject: net: use u64_stats_t in struct pcpu_lstats

In order to fix the data-race found by KCSAN, we
can use the new u64_stats_t type and its accessors instead
of plain u64 fields. This will still generate optimal code
for both 32 and 64 bit platforms.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/loopback.c    | 4 ++--
 include/linux/netdevice.h | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 47ad2478b9f3..a1c77cc00416 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -106,8 +106,8 @@ void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes)
 		lb_stats = per_cpu_ptr(dev->lstats, i);
 		do {
 			start = u64_stats_fetch_begin_irq(&lb_stats->syncp);
-			tpackets = lb_stats->packets;
-			tbytes = lb_stats->bytes;
+			tpackets = u64_stats_read(&lb_stats->packets);
+			tbytes = u64_stats_read(&lb_stats->bytes);
 		} while (u64_stats_fetch_retry_irq(&lb_stats->syncp, start));
 		*bytes   += tbytes;
 		*packets += tpackets;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 461a36220cf4..f857f01234f7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2396,8 +2396,8 @@ struct pcpu_sw_netstats {
 } __aligned(4 * sizeof(u64));
 
 struct pcpu_lstats {
-	u64 packets;
-	u64 bytes;
+	u64_stats_t packets;
+	u64_stats_t bytes;
 	struct u64_stats_sync syncp;
 } __aligned(2 * sizeof(u64));
 
@@ -2408,8 +2408,8 @@ static inline void dev_lstats_add(struct net_device *dev, unsigned int len)
 	struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats);
 
 	u64_stats_update_begin(&lstats->syncp);
-	lstats->bytes += len;
-	lstats->packets++;
+	u64_stats_add(&lstats->bytes, len);
+	u64_stats_inc(&lstats->packets);
 	u64_stats_update_end(&lstats->syncp);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From c305c6ae79e2ce20c22660ceda94f0d86d639a82 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 7 Nov 2019 18:29:11 -0800
Subject: net: add annotations on hh->hh_len lockless accesses

KCSAN reported a data-race [1]

While we can use READ_ONCE() on the read sides,
we need to make sure hh->hh_len is written last.

[1]

BUG: KCSAN: data-race in eth_header_cache / neigh_resolve_output

write to 0xffff8880b9dedcb8 of 4 bytes by task 29760 on cpu 0:
 eth_header_cache+0xa9/0xd0 net/ethernet/eth.c:247
 neigh_hh_init net/core/neighbour.c:1463 [inline]
 neigh_resolve_output net/core/neighbour.c:1480 [inline]
 neigh_resolve_output+0x415/0x470 net/core/neighbour.c:1470
 neigh_output include/net/neighbour.h:511 [inline]
 ip6_finish_output2+0x7a2/0xec0 net/ipv6/ip6_output.c:116
 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline]
 __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127
 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152
 NF_HOOK_COND include/linux/netfilter.h:294 [inline]
 ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175
 dst_output include/net/dst.h:436 [inline]
 NF_HOOK include/linux/netfilter.h:305 [inline]
 ndisc_send_skb+0x459/0x5f0 net/ipv6/ndisc.c:505
 ndisc_send_ns+0x207/0x430 net/ipv6/ndisc.c:647
 rt6_probe_deferred+0x98/0xf0 net/ipv6/route.c:615
 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269
 worker_thread+0xa0/0x800 kernel/workqueue.c:2415
 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352

read to 0xffff8880b9dedcb8 of 4 bytes by task 29572 on cpu 1:
 neigh_resolve_output net/core/neighbour.c:1479 [inline]
 neigh_resolve_output+0x113/0x470 net/core/neighbour.c:1470
 neigh_output include/net/neighbour.h:511 [inline]
 ip6_finish_output2+0x7a2/0xec0 net/ipv6/ip6_output.c:116
 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline]
 __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127
 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152
 NF_HOOK_COND include/linux/netfilter.h:294 [inline]
 ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175
 dst_output include/net/dst.h:436 [inline]
 NF_HOOK include/linux/netfilter.h:305 [inline]
 ndisc_send_skb+0x459/0x5f0 net/ipv6/ndisc.c:505
 ndisc_send_ns+0x207/0x430 net/ipv6/ndisc.c:647
 rt6_probe_deferred+0x98/0xf0 net/ipv6/route.c:615
 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269
 worker_thread+0xa0/0x800 kernel/workqueue.c:2415
 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 29572 Comm: kworker/1:4 Not tainted 5.4.0-rc6+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: events rt6_probe_deferred

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/firewire/net.c  | 6 +++++-
 include/net/neighbour.h | 2 +-
 net/core/neighbour.c    | 4 ++--
 net/ethernet/eth.c      | 7 ++++++-
 4 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index b132ab9ad607..715e491dfbc3 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -250,7 +250,11 @@ static int fwnet_header_cache(const struct neighbour *neigh,
 	h = (struct fwnet_header *)((u8 *)hh->hh_data + HH_DATA_OFF(sizeof(*h)));
 	h->h_proto = type;
 	memcpy(h->h_dest, neigh->ha, net->addr_len);
-	hh->hh_len = FWNET_HLEN;
+
+	/* Pairs with the READ_ONCE() in neigh_resolve_output(),
+	 * neigh_hh_output() and neigh_update_hhs().
+	 */
+	smp_store_release(&hh->hh_len, FWNET_HLEN);
 
 	return 0;
 }
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 50a67bd6a434..6a86e49181db 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -468,7 +468,7 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb
 
 	do {
 		seq = read_seqbegin(&hh->hh_lock);
-		hh_len = hh->hh_len;
+		hh_len = READ_ONCE(hh->hh_len);
 		if (likely(hh_len <= HH_DATA_MOD)) {
 			hh_alen = HH_DATA_MOD;
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 8c82e95f7539..652da6369037 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1197,7 +1197,7 @@ static void neigh_update_hhs(struct neighbour *neigh)
 
 	if (update) {
 		hh = &neigh->hh;
-		if (hh->hh_len) {
+		if (READ_ONCE(hh->hh_len)) {
 			write_seqlock_bh(&hh->hh_lock);
 			update(hh, neigh->dev, neigh->ha);
 			write_sequnlock_bh(&hh->hh_lock);
@@ -1476,7 +1476,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
 		struct net_device *dev = neigh->dev;
 		unsigned int seq;
 
-		if (dev->header_ops->cache && !neigh->hh.hh_len)
+		if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len))
 			neigh_hh_init(neigh);
 
 		do {
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 17374afee28f..9040fe55e0f5 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -244,7 +244,12 @@ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16
 	eth->h_proto = type;
 	memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
 	memcpy(eth->h_dest, neigh->ha, ETH_ALEN);
-	hh->hh_len = ETH_HLEN;
+
+	/* Pairs with READ_ONCE() in neigh_resolve_output(),
+	 * neigh_hh_output() and neigh_update_hhs().
+	 */
+	smp_store_release(&hh->hh_len, ETH_HLEN);
+
 	return 0;
 }
 EXPORT_SYMBOL(eth_header_cache);
-- 
cgit v1.2.3-59-g8ed1b


From f8cc62ca3e660ae3fdaee533b1d554297cd2ae82 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 7 Nov 2019 18:49:43 -0800
Subject: net: add a READ_ONCE() in skb_peek_tail()

skb_peek_tail() can be used without protection of a lock,
as spotted by KCSAN [1]

In order to avoid load-stearing, add a READ_ONCE()

Note that the corresponding WRITE_ONCE() are already there.

[1]
BUG: KCSAN: data-race in sk_wait_data / skb_queue_tail

read to 0xffff8880b36a4118 of 8 bytes by task 20426 on cpu 1:
 skb_peek_tail include/linux/skbuff.h:1784 [inline]
 sk_wait_data+0x15b/0x250 net/core/sock.c:2477
 kcm_wait_data+0x112/0x1f0 net/kcm/kcmsock.c:1103
 kcm_recvmsg+0xac/0x320 net/kcm/kcmsock.c:1130
 sock_recvmsg_nosec net/socket.c:871 [inline]
 sock_recvmsg net/socket.c:889 [inline]
 sock_recvmsg+0x92/0xb0 net/socket.c:885
 ___sys_recvmsg+0x1a0/0x3e0 net/socket.c:2480
 do_recvmmsg+0x19a/0x5c0 net/socket.c:2601
 __sys_recvmmsg+0x1ef/0x200 net/socket.c:2680
 __do_sys_recvmmsg net/socket.c:2703 [inline]
 __se_sys_recvmmsg net/socket.c:2696 [inline]
 __x64_sys_recvmmsg+0x89/0xb0 net/socket.c:2696
 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

write to 0xffff8880b36a4118 of 8 bytes by task 451 on cpu 0:
 __skb_insert include/linux/skbuff.h:1852 [inline]
 __skb_queue_before include/linux/skbuff.h:1958 [inline]
 __skb_queue_tail include/linux/skbuff.h:1991 [inline]
 skb_queue_tail+0x7e/0xc0 net/core/skbuff.c:3145
 kcm_queue_rcv_skb+0x202/0x310 net/kcm/kcmsock.c:206
 kcm_rcv_strparser+0x74/0x4b0 net/kcm/kcmsock.c:370
 __strp_recv+0x348/0xf50 net/strparser/strparser.c:309
 strp_recv+0x84/0xa0 net/strparser/strparser.c:343
 tcp_read_sock+0x174/0x5c0 net/ipv4/tcp.c:1639
 strp_read_sock+0xd4/0x140 net/strparser/strparser.c:366
 do_strp_work net/strparser/strparser.c:414 [inline]
 strp_work+0x9a/0xe0 net/strparser/strparser.c:423
 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269
 worker_thread+0xa0/0x800 kernel/workqueue.c:2415
 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 451 Comm: kworker/u4:3 Not tainted 5.4.0-rc3+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: kstrp strp_work

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 53238ac725a3..dfe02b658829 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1795,7 +1795,7 @@ static inline struct sk_buff *skb_peek_next(struct sk_buff *skb,
  */
 static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_)
 {
-	struct sk_buff *skb = list_->prev;
+	struct sk_buff *skb = READ_ONCE(list_->prev);
 
 	if (skb == (struct sk_buff *)list_)
 		skb = NULL;
@@ -1861,7 +1861,9 @@ static inline void __skb_insert(struct sk_buff *newsk,
 				struct sk_buff *prev, struct sk_buff *next,
 				struct sk_buff_head *list)
 {
-	/* see skb_queue_empty_lockless() for the opposite READ_ONCE() */
+	/* See skb_queue_empty_lockless() and skb_peek_tail()
+	 * for the opposite READ_ONCE()
+	 */
 	WRITE_ONCE(newsk->next, next);
 	WRITE_ONCE(newsk->prev, prev);
 	WRITE_ONCE(next->prev, newsk);
-- 
cgit v1.2.3-59-g8ed1b


From 6912daed05e1370af5253aea6f2116805c0e57f8 Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Wed, 23 Oct 2019 11:59:00 +0200
Subject: mac80211: Shrink the size of ack_frame_id to make room for
 tx_time_est
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To implement airtime queue limiting, we need to keep a running account of
the estimated airtime of all skbs queued into the device. Do to this
correctly, we need to store the airtime estimate into the skb so we can
decrease the outstanding balance when the skb is freed. This means that the
time estimate must be stored somewhere that will survive for the lifetime
of the skb.

To get this, decrease the size of the ack_frame_id field to 6 bits, and
lower the size of the ID space accordingly. This leaves 10 bits for use for
tx_time_est, which is enough to store a maximum of 4096 us, if we shift the
values so they become units of 4us.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/157182474063.150713.16132669599100802716.stgit@toke.dk
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 4 +++-
 net/mac80211/cfg.c     | 2 +-
 net/mac80211/tx.c      | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index f5996960eace..c643a19dce96 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -967,6 +967,7 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate)
  * @band: the band to transmit on (use for checking for races)
  * @hw_queue: HW queue to put the frame on, skb_get_queue_mapping() gives the AC
  * @ack_frame_id: internal frame ID for TX status, used internally
+ * @tx_time_est: TX time estimate in units of 4us, used internally
  * @control: union part for control data
  * @control.rates: TX rates array to try
  * @control.rts_cts_rate_idx: rate for RTS or CTS
@@ -1007,7 +1008,8 @@ struct ieee80211_tx_info {
 
 	u8 hw_queue;
 
-	u16 ack_frame_id;
+	u16 ack_frame_id:6;
+	u16 tx_time_est:10;
 
 	union {
 		struct {
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 70739e746c13..4fb7f1f12109 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3428,7 +3428,7 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
 
 	spin_lock_irqsave(&local->ack_status_lock, spin_flags);
 	id = idr_alloc(&local->ack_status_frames, ack_skb,
-		       1, 0x10000, GFP_ATOMIC);
+		       1, 0x40, GFP_ATOMIC);
 	spin_unlock_irqrestore(&local->ack_status_lock, spin_flags);
 
 	if (id < 0) {
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index a4c435abe15f..db38be1b75fa 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2430,7 +2430,7 @@ static int ieee80211_store_ack_skb(struct ieee80211_local *local,
 
 		spin_lock_irqsave(&local->ack_status_lock, flags);
 		id = idr_alloc(&local->ack_status_frames, ack_skb,
-			       1, 0x10000, GFP_ATOMIC);
+			       1, 0x40, GFP_ATOMIC);
 		spin_unlock_irqrestore(&local->ack_status_lock, flags);
 
 		if (id >= 0) {
-- 
cgit v1.2.3-59-g8ed1b


From 14f34e36b36ceede9877ca422a62fcac17b52023 Mon Sep 17 00:00:00 2001
From: Gurumoorthi Gnanasambandhan <gguru@codeaurora.org>
Date: Thu, 31 Oct 2019 23:46:40 +0200
Subject: cfg80211: VLAN offload support for set_key and set_sta_vlan

This provides an alternative mechanism for AP VLAN support where a
single netdev is used with VLAN tagged frames instead of separate
netdevs for each VLAN without tagged frames from the WLAN driver.

By setting NL80211_EXT_FEATURE_VLAN_OFFLOAD flag the driver indicates
support for a single netdev with VLAN tagged frames. Separate
VLAN-specific netdevs can be added using RTM_NEWLINK/IFLA_VLAN_ID
similarly to Ethernet. NL80211_CMD_NEW_KEY (for group keys),
NL80211_CMD_NEW_STATION, and NL80211_CMD_SET_STATION will optionally
specify vlan_id using NL80211_ATTR_VLAN_ID.

Signed-off-by: Gurumoorthi Gnanasambandhan <gguru@codeaurora.org>
Signed-off-by: Jouni Malinen <jouni@codeaurora.org>
Link: https://lore.kernel.org/r/20191031214640.5012-1-jouni@codeaurora.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  4 ++++
 include/uapi/linux/nl80211.h | 26 ++++++++++++++++++++++++++
 net/wireless/nl80211.c       | 11 +++++++++++
 3 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 4ab2c49423dc..e309cc826b40 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -565,6 +565,7 @@ struct vif_params {
  *	with the get_key() callback, must be in little endian,
  *	length given by @seq_len.
  * @seq_len: length of @seq.
+ * @vlan_id: vlan_id for VLAN group key (if nonzero)
  * @mode: key install mode (RX_TX, NO_TX or SET_TX)
  */
 struct key_params {
@@ -572,6 +573,7 @@ struct key_params {
 	const u8 *seq;
 	int key_len;
 	int seq_len;
+	u16 vlan_id;
 	u32 cipher;
 	enum nl80211_key_mode mode;
 };
@@ -1124,6 +1126,7 @@ struct sta_txpwr {
  *	(bitmask of BIT(%NL80211_STA_FLAG_...))
  * @listen_interval: listen interval or -1 for no change
  * @aid: AID or zero for no change
+ * @vlan_id: VLAN ID for station (if nonzero)
  * @peer_aid: mesh peer AID or zero for no change
  * @plink_action: plink action to take
  * @plink_state: set the peer link state for a station
@@ -1159,6 +1162,7 @@ struct station_parameters {
 	u32 sta_modify_mask;
 	int listen_interval;
 	u16 aid;
+	u16 vlan_id;
 	u16 peer_aid;
 	u8 supported_rates_len;
 	u8 plink_action;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 64135ab3a7ac..341e0e8cae46 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -248,6 +248,22 @@
  * %NL80211_ATTR_SAE_PASSWORD.
  */
 
+/**
+ * DOC: VLAN offload support for setting group keys and binding STAs to VLANs
+ *
+ * By setting @NL80211_EXT_FEATURE_VLAN_OFFLOAD flag drivers can indicate they
+ * support offloading VLAN functionality in a manner where the driver exposes a
+ * single netdev that uses VLAN tagged frames and separate VLAN-specific netdevs
+ * can then be added using RTM_NEWLINK/IFLA_VLAN_ID similarly to the Ethernet
+ * case. Frames received from stations that are not assigned to any VLAN are
+ * delivered on the main netdev and frames to such stations can be sent through
+ * that main netdev.
+ *
+ * %NL80211_CMD_NEW_KEY (for group keys), %NL80211_CMD_NEW_STATION, and
+ * %NL80211_CMD_SET_STATION will optionally specify vlan_id using
+ * %NL80211_ATTR_VLAN_ID.
+ */
+
 /**
  * enum nl80211_commands - supported nl80211 commands
  *
@@ -2381,6 +2397,9 @@ enum nl80211_commands {
  *	the allowed channel bandwidth configurations. (u8 attribute)
  *	Defined by IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13.
  *
+ * @NL80211_ATTR_VLAN_ID: VLAN ID (1..4094) for the station and VLAN group key
+ *	(u16).
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2843,6 +2862,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_WIPHY_EDMG_CHANNELS,
 	NL80211_ATTR_WIPHY_EDMG_BW_CONFIG,
 
+	NL80211_ATTR_VLAN_ID,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -5492,6 +5513,10 @@ enum nl80211_feature_flags {
  * @NL80211_EXT_FEATURE_SAE_OFFLOAD: Device wants to do SAE authentication in
  *	station mode (SAE password is passed as part of the connect command).
  *
+ * @NL80211_EXT_FEATURE_VLAN_OFFLOAD: The driver supports a single netdev
+ *	with VLAN tagged frames and separate VLAN-specific netdevs added using
+ *	vconfig similarly to the Ethernet case.
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -5537,6 +5562,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_EXT_KEY_ID,
 	NL80211_EXT_FEATURE_STA_TX_PWR,
 	NL80211_EXT_FEATURE_SAE_OFFLOAD,
+	NL80211_EXT_FEATURE_VLAN_OFFLOAD,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index d1451e731bb8..50761a4102bd 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -624,6 +624,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 					.len = SAE_PASSWORD_MAX_LEN },
 	[NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG },
 	[NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy),
+	[NL80211_ATTR_VLAN_ID] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2),
 };
 
 /* policy for the key attributes */
@@ -3940,6 +3941,10 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
 	    key.type != NL80211_KEYTYPE_GROUP)
 		return -EINVAL;
 
+	if (key.type == NL80211_KEYTYPE_GROUP &&
+	    info->attrs[NL80211_ATTR_VLAN_ID])
+		key.p.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]);
+
 	if (!rdev->ops->add_key)
 		return -EOPNOTSUPP;
 
@@ -5711,6 +5716,9 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 	if (info->attrs[NL80211_ATTR_STA_AID])
 		params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]);
 
+	if (info->attrs[NL80211_ATTR_VLAN_ID])
+		params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]);
+
 	if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL])
 		params.listen_interval =
 		     nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
@@ -5856,6 +5864,9 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 	params.listen_interval =
 		nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]);
 
+	if (info->attrs[NL80211_ATTR_VLAN_ID])
+		params.vlan_id = nla_get_u16(info->attrs[NL80211_ATTR_VLAN_ID]);
+
 	if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) {
 		params.support_p2p_ps =
 			nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]);
-- 
cgit v1.2.3-59-g8ed1b


From 90b2be27bb0e56483f335cc10fb59ec66882b949 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 8 Nov 2019 08:45:23 -0800
Subject: net/sched: annotate lockless accesses to qdisc->empty

KCSAN reported the following race [1]

BUG: KCSAN: data-race in __dev_queue_xmit / net_tx_action

read to 0xffff8880ba403508 of 1 bytes by task 21814 on cpu 1:
 __dev_xmit_skb net/core/dev.c:3389 [inline]
 __dev_queue_xmit+0x9db/0x1b40 net/core/dev.c:3761
 dev_queue_xmit+0x21/0x30 net/core/dev.c:3825
 neigh_hh_output include/net/neighbour.h:500 [inline]
 neigh_output include/net/neighbour.h:509 [inline]
 ip6_finish_output2+0x873/0xec0 net/ipv6/ip6_output.c:116
 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline]
 __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127
 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152
 NF_HOOK_COND include/linux/netfilter.h:294 [inline]
 ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175
 dst_output include/net/dst.h:436 [inline]
 ip6_local_out+0x74/0x90 net/ipv6/output_core.c:179
 ip6_send_skb+0x53/0x110 net/ipv6/ip6_output.c:1795
 udp_v6_send_skb.isra.0+0x3ec/0xa70 net/ipv6/udp.c:1173
 udpv6_sendmsg+0x1906/0x1c20 net/ipv6/udp.c:1471
 inet6_sendmsg+0x6d/0x90 net/ipv6/af_inet6.c:576
 sock_sendmsg_nosec net/socket.c:637 [inline]
 sock_sendmsg+0x9f/0xc0 net/socket.c:657
 ___sys_sendmsg+0x2b7/0x5d0 net/socket.c:2311
 __sys_sendmmsg+0x123/0x350 net/socket.c:2413
 __do_sys_sendmmsg net/socket.c:2442 [inline]
 __se_sys_sendmmsg net/socket.c:2439 [inline]
 __x64_sys_sendmmsg+0x64/0x80 net/socket.c:2439
 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

write to 0xffff8880ba403508 of 1 bytes by interrupt on cpu 0:
 qdisc_run_begin include/net/sch_generic.h:160 [inline]
 qdisc_run include/net/pkt_sched.h:120 [inline]
 net_tx_action+0x2b1/0x6c0 net/core/dev.c:4551
 __do_softirq+0x115/0x33f kernel/softirq.c:292
 do_softirq_own_stack+0x2a/0x40 arch/x86/entry/entry_64.S:1082
 do_softirq.part.0+0x6b/0x80 kernel/softirq.c:337
 do_softirq kernel/softirq.c:329 [inline]
 __local_bh_enable_ip+0x76/0x80 kernel/softirq.c:189
 local_bh_enable include/linux/bottom_half.h:32 [inline]
 rcu_read_unlock_bh include/linux/rcupdate.h:688 [inline]
 ip6_finish_output2+0x7bb/0xec0 net/ipv6/ip6_output.c:117
 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline]
 __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127
 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152
 NF_HOOK_COND include/linux/netfilter.h:294 [inline]
 ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175
 dst_output include/net/dst.h:436 [inline]
 ip6_local_out+0x74/0x90 net/ipv6/output_core.c:179
 ip6_send_skb+0x53/0x110 net/ipv6/ip6_output.c:1795
 udp_v6_send_skb.isra.0+0x3ec/0xa70 net/ipv6/udp.c:1173
 udpv6_sendmsg+0x1906/0x1c20 net/ipv6/udp.c:1471
 inet6_sendmsg+0x6d/0x90 net/ipv6/af_inet6.c:576
 sock_sendmsg_nosec net/socket.c:637 [inline]
 sock_sendmsg+0x9f/0xc0 net/socket.c:657
 ___sys_sendmsg+0x2b7/0x5d0 net/socket.c:2311
 __sys_sendmmsg+0x123/0x350 net/socket.c:2413
 __do_sys_sendmmsg net/socket.c:2442 [inline]
 __se_sys_sendmmsg net/socket.c:2439 [inline]
 __x64_sys_sendmmsg+0x64/0x80 net/socket.c:2439
 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 21817 Comm: syz-executor.2 Not tainted 5.4.0-rc6+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Fixes: d518d2ed8640 ("net/sched: fix race between deactivation and dequeue for NOLOCK qdisc")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 6 +++---
 net/core/dev.c            | 2 +-
 net/sched/sch_generic.c   | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a8b0a9a4c686..d43da37737be 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -148,8 +148,8 @@ static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
 static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
 {
 	if (qdisc_is_percpu_stats(qdisc))
-		return qdisc->empty;
-	return !qdisc->q.qlen;
+		return READ_ONCE(qdisc->empty);
+	return !READ_ONCE(qdisc->q.qlen);
 }
 
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
@@ -157,7 +157,7 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 	if (qdisc->flags & TCQ_F_NOLOCK) {
 		if (!spin_trylock(&qdisc->seqlock))
 			return false;
-		qdisc->empty = false;
+		WRITE_ONCE(qdisc->empty, false);
 	} else if (qdisc_is_running(qdisc)) {
 		return false;
 	}
diff --git a/net/core/dev.c b/net/core/dev.c
index bb15800c8cb5..1c799d486623 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3607,7 +3607,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	qdisc_calculate_pkt_len(skb, q);
 
 	if (q->flags & TCQ_F_NOLOCK) {
-		if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty &&
+		if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) &&
 		    qdisc_run_begin(q)) {
 			if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
 					      &q->state))) {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 8561e825f401..5ab696efca95 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -652,7 +652,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 	if (likely(skb)) {
 		qdisc_update_stats_at_dequeue(qdisc, skb);
 	} else {
-		qdisc->empty = true;
+		WRITE_ONCE(qdisc->empty, true);
 	}
 
 	return skb;
-- 
cgit v1.2.3-59-g8ed1b


From 134bdac397661a5841d9f27f508190c68b26232b Mon Sep 17 00:00:00 2001
From: Tuong Lien <tuong.t.lien@dektech.com.au>
Date: Fri, 8 Nov 2019 12:05:10 +0700
Subject: tipc: add new AEAD key structure for user API

The new structure 'tipc_aead_key' is added to the 'tipc.h' for user to
be able to transfer a key to TIPC in kernel. Netlink will be used for
this purpose in the later commits.

Acked-by: Ying Xue <ying.xue@windreiver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index 76421b878767..add01db1daef 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -233,6 +233,27 @@ struct tipc_sioc_nodeid_req {
 	char node_id[TIPC_NODEID_LEN];
 };
 
+/*
+ * TIPC Crypto, AEAD
+ */
+#define TIPC_AEAD_ALG_NAME		(32)
+
+struct tipc_aead_key {
+	char alg_name[TIPC_AEAD_ALG_NAME];
+	unsigned int keylen;	/* in bytes */
+	char key[];
+};
+
+#define TIPC_AEAD_KEYLEN_MIN		(16 + 4)
+#define TIPC_AEAD_KEYLEN_MAX		(32 + 4)
+#define TIPC_AEAD_KEY_SIZE_MAX		(sizeof(struct tipc_aead_key) + \
+							TIPC_AEAD_KEYLEN_MAX)
+
+static inline int tipc_aead_key_size(struct tipc_aead_key *key)
+{
+	return sizeof(*key) + key->keylen;
+}
+
 /* The macros and functions below are deprecated:
  */
 
-- 
cgit v1.2.3-59-g8ed1b


From e1f32190cf7ddd55778b460e7d44af3f76529698 Mon Sep 17 00:00:00 2001
From: Tuong Lien <tuong.t.lien@dektech.com.au>
Date: Fri, 8 Nov 2019 12:05:12 +0700
Subject: tipc: add support for AEAD key setting via netlink

This commit adds two netlink commands to TIPC in order for user to be
able to set or remove AEAD keys:
- TIPC_NL_KEY_SET
- TIPC_NL_KEY_FLUSH

When the 'KEY_SET' is given along with the key data, the key will be
initiated and attached to TIPC crypto. On the other hand, the
'KEY_FLUSH' command will remove all existing keys if any.

Acked-by: Ying Xue <ying.xue@windreiver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc_netlink.h |   4 ++
 net/tipc/netlink.c                |  18 ++++-
 net/tipc/node.c                   | 135 ++++++++++++++++++++++++++++++++++++++
 net/tipc/node.h                   |   4 ++
 4 files changed, 160 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h
index efb958fd167d..6c2194ab745b 100644
--- a/include/uapi/linux/tipc_netlink.h
+++ b/include/uapi/linux/tipc_netlink.h
@@ -63,6 +63,8 @@ enum {
 	TIPC_NL_PEER_REMOVE,
 	TIPC_NL_BEARER_ADD,
 	TIPC_NL_UDP_GET_REMOTEIP,
+	TIPC_NL_KEY_SET,
+	TIPC_NL_KEY_FLUSH,
 
 	__TIPC_NL_CMD_MAX,
 	TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1
@@ -160,6 +162,8 @@ enum {
 	TIPC_NLA_NODE_UNSPEC,
 	TIPC_NLA_NODE_ADDR,		/* u32 */
 	TIPC_NLA_NODE_UP,		/* flag */
+	TIPC_NLA_NODE_ID,		/* data */
+	TIPC_NLA_NODE_KEY,		/* data */
 
 	__TIPC_NLA_NODE_MAX,
 	TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index d32bbd0f5e46..e53231bd23b4 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -102,7 +102,11 @@ const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
 const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = {
 	[TIPC_NLA_NODE_UNSPEC]		= { .type = NLA_UNSPEC },
 	[TIPC_NLA_NODE_ADDR]		= { .type = NLA_U32 },
-	[TIPC_NLA_NODE_UP]		= { .type = NLA_FLAG }
+	[TIPC_NLA_NODE_UP]		= { .type = NLA_FLAG },
+	[TIPC_NLA_NODE_ID]		= { .type = NLA_BINARY,
+					    .len = TIPC_NODEID_LEN},
+	[TIPC_NLA_NODE_KEY]		= { .type = NLA_BINARY,
+					    .len = TIPC_AEAD_KEY_SIZE_MAX},
 };
 
 /* Properties valid for media, bearer and link */
@@ -257,6 +261,18 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
 		.dumpit	= tipc_udp_nl_dump_remoteip,
 	},
 #endif
+#ifdef CONFIG_TIPC_CRYPTO
+	{
+		.cmd	= TIPC_NL_KEY_SET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit	= tipc_nl_node_set_key,
+	},
+	{
+		.cmd	= TIPC_NL_KEY_FLUSH,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit	= tipc_nl_node_flush_key,
+	},
+#endif
 };
 
 struct genl_family tipc_genl_family __ro_after_init = {
diff --git a/net/tipc/node.c b/net/tipc/node.c
index d8bf2c179562..aaf595613e6e 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -2760,6 +2760,141 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb,
 	return skb->len;
 }
 
+#ifdef CONFIG_TIPC_CRYPTO
+static int tipc_nl_retrieve_key(struct nlattr **attrs,
+				struct tipc_aead_key **key)
+{
+	struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY];
+
+	if (!attr)
+		return -ENODATA;
+
+	*key = (struct tipc_aead_key *)nla_data(attr);
+	if (nla_len(attr) < tipc_aead_key_size(*key))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id)
+{
+	struct nlattr *attr = attrs[TIPC_NLA_NODE_ID];
+
+	if (!attr)
+		return -ENODATA;
+
+	if (nla_len(attr) < TIPC_NODEID_LEN)
+		return -EINVAL;
+
+	*node_id = (u8 *)nla_data(attr);
+	return 0;
+}
+
+int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1];
+	struct net *net = sock_net(skb->sk);
+	struct tipc_net *tn = tipc_net(net);
+	struct tipc_node *n = NULL;
+	struct tipc_aead_key *ukey;
+	struct tipc_crypto *c;
+	u8 *id, *own_id;
+	int rc = 0;
+
+	if (!info->attrs[TIPC_NLA_NODE])
+		return -EINVAL;
+
+	rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX,
+			      info->attrs[TIPC_NLA_NODE],
+			      tipc_nl_node_policy, info->extack);
+	if (rc)
+		goto exit;
+
+	own_id = tipc_own_id(net);
+	if (!own_id) {
+		rc = -EPERM;
+		goto exit;
+	}
+
+	rc = tipc_nl_retrieve_key(attrs, &ukey);
+	if (rc)
+		goto exit;
+
+	rc = tipc_aead_key_validate(ukey);
+	if (rc)
+		goto exit;
+
+	rc = tipc_nl_retrieve_nodeid(attrs, &id);
+	switch (rc) {
+	case -ENODATA:
+		/* Cluster key mode */
+		rc = tipc_crypto_key_init(tn->crypto_tx, ukey, CLUSTER_KEY);
+		break;
+	case 0:
+		/* Per-node key mode */
+		if (!memcmp(id, own_id, NODE_ID_LEN)) {
+			c = tn->crypto_tx;
+		} else {
+			n = tipc_node_find_by_id(net, id) ?:
+				tipc_node_create(net, 0, id, 0xffffu, 0, true);
+			if (unlikely(!n)) {
+				rc = -ENOMEM;
+				break;
+			}
+			c = n->crypto_rx;
+		}
+
+		rc = tipc_crypto_key_init(c, ukey, PER_NODE_KEY);
+		if (n)
+			tipc_node_put(n);
+		break;
+	default:
+		break;
+	}
+
+exit:
+	return (rc < 0) ? rc : 0;
+}
+
+int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
+{
+	int err;
+
+	rtnl_lock();
+	err = __tipc_nl_node_set_key(skb, info);
+	rtnl_unlock();
+
+	return err;
+}
+
+int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = sock_net(skb->sk);
+	struct tipc_net *tn = tipc_net(net);
+	struct tipc_node *n;
+
+	tipc_crypto_key_flush(tn->crypto_tx);
+	rcu_read_lock();
+	list_for_each_entry_rcu(n, &tn->node_list, list)
+		tipc_crypto_key_flush(n->crypto_rx);
+	rcu_read_unlock();
+
+	pr_info("All keys are flushed!\n");
+	return 0;
+}
+
+int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info)
+{
+	int err;
+
+	rtnl_lock();
+	err = __tipc_nl_node_flush_key(skb, info);
+	rtnl_unlock();
+
+	return err;
+}
+#endif
+
 /**
  * tipc_node_dump - dump TIPC node data
  * @n: tipc node to be dumped
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 1a15cf82cb11..a6803b449a2c 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -119,5 +119,9 @@ int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info);
 int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb);
 int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb,
 				   struct netlink_callback *cb);
+#ifdef CONFIG_TIPC_CRYPTO
+int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info);
+#endif
 void tipc_node_pre_cleanup_net(struct net *exit_net);
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From a0c76345e3d3dbc40c39de2e00d15a3b7eef7885 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 8 Nov 2019 21:42:43 +0100
Subject: devlink: disallow reload operation during device cleanup

There is a race between driver code that does setup/cleanup of device
and devlink reload operation that in some drivers works with the same
code. Use after free could we easily obtained by running:

while true; do
        echo 10 > /sys/bus/netdevsim/new_device
        devlink dev reload netdevsim/netdevsim10 &
        echo 10 > /sys/bus/netdevsim/del_device
done

Fix this by enabling reload only after setup of device is complete and
disabling it at the beginning of the cleanup process.

Reported-by: Ido Schimmel <idosch@mellanox.com>
Fixes: 2d8dc5bbf4e7 ("devlink: Add support for reload")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/main.c  |  3 +++
 drivers/net/ethernet/mellanox/mlxsw/core.c |  6 ++++-
 drivers/net/netdevsim/dev.c                |  3 +++
 include/net/devlink.h                      |  7 +++--
 net/core/devlink.c                         | 42 +++++++++++++++++++++++++++++-
 5 files changed, 57 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 22c72fb7206a..77f056b0895e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -4015,6 +4015,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto err_params_unregister;
 
 	devlink_params_publish(devlink);
+	devlink_reload_enable(devlink);
 	pci_save_state(pdev);
 	return 0;
 
@@ -4126,6 +4127,8 @@ static void mlx4_remove_one(struct pci_dev *pdev)
 	struct devlink *devlink = priv_to_devlink(priv);
 	int active_vfs = 0;
 
+	devlink_reload_disable(devlink);
+
 	if (mlx4_is_slave(dev))
 		persist->interface_state |= MLX4_INTERFACE_STATE_NOWAIT;
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index e1a90f5bddd0..da436a6aad2f 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1198,8 +1198,10 @@ __mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 	if (err)
 		goto err_thermal_init;
 
-	if (mlxsw_driver->params_register)
+	if (mlxsw_driver->params_register) {
 		devlink_params_publish(devlink);
+		devlink_reload_enable(devlink);
+	}
 
 	return 0;
 
@@ -1263,6 +1265,8 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core,
 {
 	struct devlink *devlink = priv_to_devlink(mlxsw_core);
 
+	if (!reload)
+		devlink_reload_disable(devlink);
 	if (devlink_is_reload_failed(devlink)) {
 		if (!reload)
 			/* Only the parts that were not de-initialized in the
diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index 3da96c7e8265..059711edfc61 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -820,6 +820,7 @@ int nsim_dev_probe(struct nsim_bus_dev *nsim_bus_dev)
 		goto err_bpf_dev_exit;
 
 	devlink_params_publish(devlink);
+	devlink_reload_enable(devlink);
 	return 0;
 
 err_bpf_dev_exit:
@@ -865,6 +866,8 @@ void nsim_dev_remove(struct nsim_bus_dev *nsim_bus_dev)
 	struct nsim_dev *nsim_dev = dev_get_drvdata(&nsim_bus_dev->dev);
 	struct devlink *devlink = priv_to_devlink(nsim_dev);
 
+	devlink_reload_disable(devlink);
+
 	nsim_dev_reload_destroy(nsim_dev);
 
 	nsim_bpf_dev_exit(nsim_dev);
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 8d6b5846822c..7891611868e4 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -38,8 +38,9 @@ struct devlink {
 	struct device *dev;
 	possible_net_t _net;
 	struct mutex lock;
-	bool reload_failed;
-	bool registered;
+	u8 reload_failed:1,
+	   reload_enabled:1,
+	   registered:1;
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
@@ -824,6 +825,8 @@ void devlink_net_set(struct devlink *devlink, struct net *net);
 struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size);
 int devlink_register(struct devlink *devlink, struct device *dev);
 void devlink_unregister(struct devlink *devlink);
+void devlink_reload_enable(struct devlink *devlink);
+void devlink_reload_disable(struct devlink *devlink);
 void devlink_free(struct devlink *devlink);
 int devlink_port_register(struct devlink *devlink,
 			  struct devlink_port *devlink_port,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index ff53f7d29dea..2e027c9436e0 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2791,6 +2791,9 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net,
 {
 	int err;
 
+	if (!devlink->reload_enabled)
+		return -EOPNOTSUPP;
+
 	err = devlink->ops->reload_down(devlink, !!dest_net, extack);
 	if (err)
 		return err;
@@ -6308,12 +6311,49 @@ EXPORT_SYMBOL_GPL(devlink_register);
 void devlink_unregister(struct devlink *devlink)
 {
 	mutex_lock(&devlink_mutex);
+	WARN_ON(devlink_reload_supported(devlink) &&
+		devlink->reload_enabled);
 	devlink_notify(devlink, DEVLINK_CMD_DEL);
 	list_del(&devlink->list);
 	mutex_unlock(&devlink_mutex);
 }
 EXPORT_SYMBOL_GPL(devlink_unregister);
 
+/**
+ *	devlink_reload_enable - Enable reload of devlink instance
+ *
+ *	@devlink: devlink
+ *
+ *	Should be called at end of device initialization
+ *	process when reload operation is supported.
+ */
+void devlink_reload_enable(struct devlink *devlink)
+{
+	mutex_lock(&devlink_mutex);
+	devlink->reload_enabled = true;
+	mutex_unlock(&devlink_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_reload_enable);
+
+/**
+ *	devlink_reload_disable - Disable reload of devlink instance
+ *
+ *	@devlink: devlink
+ *
+ *	Should be called at the beginning of device cleanup
+ *	process when reload operation is supported.
+ */
+void devlink_reload_disable(struct devlink *devlink)
+{
+	mutex_lock(&devlink_mutex);
+	/* Mutex is taken which ensures that no reload operation is in
+	 * progress while setting up forbidded flag.
+	 */
+	devlink->reload_enabled = false;
+	mutex_unlock(&devlink_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_reload_disable);
+
 /**
  *	devlink_free - Free devlink instance resources
  *
@@ -8201,7 +8241,7 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net)
 			if (WARN_ON(!devlink_reload_supported(devlink)))
 				continue;
 			err = devlink_reload(devlink, &init_net, NULL);
-			if (err)
+			if (err && err != -EOPNOTSUPP)
 				pr_warn("Failed to reload devlink instance into init_net\n");
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From aef587be42925f92418083f08852d0011b2766ca Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Nov 2019 13:20:32 +0800
Subject: sctp: add pf_expose per netns and sock and asoc

As said in rfc7829, section 3, point 12:

  The SCTP stack SHOULD expose the PF state of its destination
  addresses to the ULP as well as provide the means to notify the
  ULP of state transitions of its destination addresses from
  active to PF, and vice versa.  However, it is recommended that
  an SCTP stack implementing SCTP-PF also allows for the ULP to be
  kept ignorant of the PF state of its destinations and the
  associated state transitions, thus allowing for retention of the
  simpler state transition model of [RFC4960] in the ULP.

Not only does it allow to expose the PF state to ULP, but also
allow to ignore sctp-pf to ULP.

So this patch is to add pf_expose per netns, sock and asoc. And in
sctp_assoc_control_transport(), ulp_notify will be set to false if
asoc->expose is not 'enabled' in next patch.

It also allows a user to change pf_expose per netns by sysctl, and
pf_expose per sock and asoc will be initialized with it.

Note that pf_expose also works for SCTP_GET_PEER_ADDR_INFO sockopt,
to not allow a user to query the state of a sctp-pf peer address
when pf_expose is 'disabled', as said in section 7.3.

v1->v2:
  - Fix a build warning noticed by Nathan Chancellor.
v2->v3:
  - set pf_expose to UNUSED by default to keep compatible with old
    applications.
v3->v4:
  - add a new entry for pf_expose on ip-sysctl.txt, as Marcelo suggested.
  - change this patch to 1/5, and move sctp_assoc_control_transport
    change into 2/5, as Marcelo suggested.
  - use SCTP_PF_EXPOSE_UNSET instead of SCTP_PF_EXPOSE_UNUSED, and
    set SCTP_PF_EXPOSE_UNSET to 0 in enum, as Marcelo suggested.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 22 ++++++++++++++++++++++
 include/net/netns/sctp.h               |  8 ++++++++
 include/net/sctp/constants.h           | 10 ++++++++++
 include/net/sctp/structs.h             |  2 ++
 include/uapi/linux/sctp.h              |  1 +
 net/sctp/associola.c                   |  1 +
 net/sctp/protocol.c                    |  3 +++
 net/sctp/socket.c                      | 13 +++++++++++--
 net/sctp/sysctl.c                      | 10 ++++++++++
 9 files changed, 68 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 8d4ad1d1ae26..0b0feb5b6b00 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -2091,6 +2091,28 @@ pf_enable - INTEGER
 
 	Default: 1
 
+pf_expose - INTEGER
+	Unset or enable/disable pf (pf is short for potentially failed) state
+	exposure.  Applications can control the exposure of the PF path state
+	in the SCTP_PEER_ADDR_CHANGE event and the SCTP_GET_PEER_ADDR_INFO
+	sockopt.   When it's unset, no SCTP_PEER_ADDR_CHANGE event with
+	SCTP_ADDR_PF state will be sent and a SCTP_PF-state transport info
+	can be got via SCTP_GET_PEER_ADDR_INFO sockopt;  When it's enabled,
+	a SCTP_PEER_ADDR_CHANGE event will be sent for a transport becoming
+	SCTP_PF state and a SCTP_PF-state transport info can be got via
+	SCTP_GET_PEER_ADDR_INFO sockopt;  When it's diabled, no
+	SCTP_PEER_ADDR_CHANGE event will be sent and it returns -EACCES when
+	trying to get a SCTP_PF-state transport info via SCTP_GET_PEER_ADDR_INFO
+	sockopt.
+
+	0: Unset pf state exposure, Compatible with old applications.
+
+	1: Disable pf state exposure.
+
+	2: Enable pf state exposure.
+
+	Default: 0
+
 addip_noauth_enable - BOOLEAN
 	Dynamic Address Reconfiguration (ADD-IP) requires the use of
 	authentication to protect the operations of adding or removing new
diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index bdc0f27b8514..18c3ddae77a3 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -96,6 +96,14 @@ struct netns_sctp {
 	 */
 	int pf_enable;
 
+	/*
+	 * Disable Potentially-Failed state exposure, ignored by default
+	 * pf_expose	-  0  : compatible with old applications (by default)
+	 *		-  1  : disable pf state exposure
+	 *		-  2  : enable  pf state exposure
+	 */
+	int pf_expose;
+
 	/*
 	 * Policy for preforming sctp/socket accounting
 	 * 0   - do socket level accounting, all assocs share sk_sndbuf
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 823afc42a3aa..e88b77a34cb1 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -286,6 +286,16 @@ enum { SCTP_MAX_GABS = 16 };
 				 * functions simpler to write.
 				 */
 
+/* These are the values for pf exposure, UNUSED is to keep compatible with old
+ * applications by default.
+ */
+enum {
+	SCTP_PF_EXPOSE_UNSET,
+	SCTP_PF_EXPOSE_DISABLE,
+	SCTP_PF_EXPOSE_ENABLE,
+};
+#define SCTP_PF_EXPOSE_MAX	SCTP_PF_EXPOSE_ENABLE
+
 /* These return values describe the success or failure of a number of
  * routines which form the lower interface to SCTP_outqueue.
  */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 503fbc3cd819..9a43738774d7 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -215,6 +215,7 @@ struct sctp_sock {
 	__u32 adaptation_ind;
 	__u32 pd_point;
 	__u16	nodelay:1,
+		pf_expose:2,
 		reuse:1,
 		disable_fragments:1,
 		v4mapped:1,
@@ -2053,6 +2054,7 @@ struct sctp_association {
 
 	__u8 need_ecne:1,	/* Need to send an ECNE Chunk? */
 	     temp:1,		/* Is it a temporary association? */
+	     pf_expose:2,       /* Expose pf state? */
 	     force_delay:1;
 
 	__u8 strreset_enable;
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 6bce7f9837a9..765f41a080b4 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -933,6 +933,7 @@ struct sctp_paddrinfo {
 enum sctp_spinfo_state {
 	SCTP_INACTIVE,
 	SCTP_PF,
+#define	SCTP_POTENTIALLY_FAILED		SCTP_PF
 	SCTP_ACTIVE,
 	SCTP_UNCONFIRMED,
 	SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 1b9809ad7725..3bf3380a5521 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -86,6 +86,7 @@ static struct sctp_association *sctp_association_init(
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
 	asoc->pf_retrans  = sp->pf_retrans;
+	asoc->pf_expose   = sp->pf_expose;
 
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 08d14d86ecfb..f86be7bf0972 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1220,6 +1220,9 @@ static int __net_init sctp_defaults_init(struct net *net)
 	/* Enable pf state by default */
 	net->sctp.pf_enable = 1;
 
+	/* Ignore pf exposure feature by default */
+	net->sctp.pf_expose = SCTP_PF_EXPOSE_UNSET;
+
 	/* Association.Max.Retrans  - 10 attempts
 	 * Path.Max.Retrans         - 5  attempts (per destination address)
 	 * Max.Init.Retransmits     - 8  attempts
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 53abb97e0061..318222e9c0a8 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5041,6 +5041,7 @@ static int sctp_init_sock(struct sock *sk)
 	sp->hbinterval  = net->sctp.hb_interval;
 	sp->pathmaxrxt  = net->sctp.max_retrans_path;
 	sp->pf_retrans  = net->sctp.pf_retrans;
+	sp->pf_expose   = net->sctp.pf_expose;
 	sp->pathmtu     = 0; /* allow default discovery */
 	sp->sackdelay   = net->sctp.sack_timeout;
 	sp->sackfreq	= 2;
@@ -5521,8 +5522,16 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
 
 	transport = sctp_addr_id2transport(sk, &pinfo.spinfo_address,
 					   pinfo.spinfo_assoc_id);
-	if (!transport)
-		return -EINVAL;
+	if (!transport) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	if (transport->state == SCTP_PF &&
+	    transport->asoc->pf_expose == SCTP_PF_EXPOSE_DISABLE) {
+		retval = -EACCES;
+		goto out;
+	}
 
 	pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
 	pinfo.spinfo_state = transport->state;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 238cf1737576..5d1ad44a29d1 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -34,6 +34,7 @@ static int rto_alpha_min = 0;
 static int rto_beta_min = 0;
 static int rto_alpha_max = 1000;
 static int rto_beta_max = 1000;
+static int pf_expose_max = SCTP_PF_EXPOSE_MAX;
 
 static unsigned long max_autoclose_min = 0;
 static unsigned long max_autoclose_max =
@@ -318,6 +319,15 @@ static struct ctl_table sctp_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "pf_expose",
+		.data		= &init_net.sctp.pf_expose,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &pf_expose_max,
+	},
 
 	{ /* sentinel */ }
 };
-- 
cgit v1.2.3-59-g8ed1b


From 768e15182dcb809e39c338290dda10c4e271d133 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Nov 2019 13:20:33 +0800
Subject: sctp: add SCTP_ADDR_POTENTIALLY_FAILED notification

SCTP Quick failover draft section 5.1, point 5 has been removed
from rfc7829. Instead, "the sender SHOULD (i) notify the Upper
Layer Protocol (ULP) about this state transition", as said in
section 3.2, point 8.

So this patch is to add SCTP_ADDR_POTENTIALLY_FAILED, defined
in section 7.1, "which is reported if the affected address
becomes PF". Also remove transport cwnd's update when moving
from PF back to ACTIVE , which is no longer in rfc7829 either.

Note that ulp_notify will be set to false if asoc->expose is
not 'enabled', according to last patch.

v2->v3:
  - define SCTP_ADDR_PF SCTP_ADDR_POTENTIALLY_FAILED.
v3->v4:
  - initialize spc_state with SCTP_ADDR_AVAILABLE, as Marcelo suggested.
  - check asoc->pf_expose in sctp_assoc_control_transport(), as Marcelo
    suggested.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sctp.h |  2 ++
 net/sctp/associola.c      | 32 ++++++++++++++------------------
 2 files changed, 16 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 765f41a080b4..d99b428ac34e 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -410,6 +410,8 @@ enum sctp_spc_state {
 	SCTP_ADDR_ADDED,
 	SCTP_ADDR_MADE_PRIM,
 	SCTP_ADDR_CONFIRMED,
+	SCTP_ADDR_POTENTIALLY_FAILED,
+#define SCTP_ADDR_PF	SCTP_ADDR_POTENTIALLY_FAILED
 };
 
 
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 3bf3380a5521..ad7a74e942d3 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -787,8 +787,8 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 				  enum sctp_transport_cmd command,
 				  sctp_sn_error_t error)
 {
+	int spc_state = SCTP_ADDR_AVAILABLE;
 	bool ulp_notify = true;
-	int spc_state = 0;
 
 	/* Record the transition on the transport.  */
 	switch (command) {
@@ -797,19 +797,13 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		 * to heartbeat success, report the SCTP_ADDR_CONFIRMED
 		 * state to the user, otherwise report SCTP_ADDR_AVAILABLE.
 		 */
-		if (SCTP_UNCONFIRMED == transport->state &&
-		    SCTP_HEARTBEAT_SUCCESS == error)
-			spc_state = SCTP_ADDR_CONFIRMED;
-		else
-			spc_state = SCTP_ADDR_AVAILABLE;
-		/* Don't inform ULP about transition from PF to
-		 * active state and set cwnd to 1 MTU, see SCTP
-		 * Quick failover draft section 5.1, point 5
-		 */
-		if (transport->state == SCTP_PF) {
+		if (transport->state == SCTP_PF &&
+		    asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE)
 			ulp_notify = false;
-			transport->cwnd = asoc->pathmtu;
-		}
+		else if (transport->state == SCTP_UNCONFIRMED &&
+			 error == SCTP_HEARTBEAT_SUCCESS)
+			spc_state = SCTP_ADDR_CONFIRMED;
+
 		transport->state = SCTP_ACTIVE;
 		break;
 
@@ -818,19 +812,21 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 		 * to inactive state.  Also, release the cached route since
 		 * there may be a better route next time.
 		 */
-		if (transport->state != SCTP_UNCONFIRMED)
+		if (transport->state != SCTP_UNCONFIRMED) {
 			transport->state = SCTP_INACTIVE;
-		else {
+			spc_state = SCTP_ADDR_UNREACHABLE;
+		} else {
 			sctp_transport_dst_release(transport);
 			ulp_notify = false;
 		}
-
-		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
 	case SCTP_TRANSPORT_PF:
 		transport->state = SCTP_PF;
-		ulp_notify = false;
+		if (asoc->pf_expose != SCTP_PF_EXPOSE_ENABLE)
+			ulp_notify = false;
+		else
+			spc_state = SCTP_ADDR_POTENTIALLY_FAILED;
 		break;
 
 	default:
-- 
cgit v1.2.3-59-g8ed1b


From 8d2a6935d842f12c25611b165eace778adb09a53 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Nov 2019 13:20:34 +0800
Subject: sctp: add SCTP_EXPOSE_POTENTIALLY_FAILED_STATE sockopt

This is a sockopt defined in section 7.3 of rfc7829: "Exposing
the Potentially Failed Path State", by which users can change
pf_expose per sock and asoc.

The new sockopt SCTP_EXPOSE_POTENTIALLY_FAILED_STATE is also
known as SCTP_EXPOSE_PF_STATE for short.

v2->v3:
  - return -EINVAL if params.assoc_value > SCTP_PF_EXPOSE_MAX.
  - define SCTP_EXPOSE_PF_STATE SCTP_EXPOSE_POTENTIALLY_FAILED_STATE.
v3->v4:
  - improve changelog.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sctp.h |  2 ++
 net/sctp/socket.c         | 79 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index d99b428ac34e..a190e4a7f546 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -137,6 +137,8 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_ASCONF_SUPPORTED	128
 #define SCTP_AUTH_SUPPORTED	129
 #define SCTP_ECN_SUPPORTED	130
+#define SCTP_EXPOSE_POTENTIALLY_FAILED_STATE	131
+#define SCTP_EXPOSE_PF_STATE	SCTP_EXPOSE_POTENTIALLY_FAILED_STATE
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE	0x0000
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 318222e9c0a8..74c4e62ac741 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4589,6 +4589,40 @@ out:
 	return retval;
 }
 
+static int sctp_setsockopt_pf_expose(struct sock *sk,
+				     char __user *optval,
+				     unsigned int optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int retval = -EINVAL;
+
+	if (optlen != sizeof(params))
+		goto out;
+
+	if (copy_from_user(&params, optval, optlen)) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	if (params.assoc_value > SCTP_PF_EXPOSE_MAX)
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
+	    sctp_style(sk, UDP))
+		goto out;
+
+	if (asoc)
+		asoc->pf_expose = params.assoc_value;
+	else
+		sctp_sk(sk)->pf_expose = params.assoc_value;
+	retval = 0;
+
+out:
+	return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4798,6 +4832,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_ECN_SUPPORTED:
 		retval = sctp_setsockopt_ecn_supported(sk, optval, optlen);
 		break;
+	case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE:
+		retval = sctp_setsockopt_pf_expose(sk, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
@@ -7909,6 +7946,45 @@ out:
 	return retval;
 }
 
+static int sctp_getsockopt_pf_expose(struct sock *sk, int len,
+				     char __user *optval,
+				     int __user *optlen)
+{
+	struct sctp_assoc_value params;
+	struct sctp_association *asoc;
+	int retval = -EFAULT;
+
+	if (len < sizeof(params)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	len = sizeof(params);
+	if (copy_from_user(&params, optval, len))
+		goto out;
+
+	asoc = sctp_id2assoc(sk, params.assoc_id);
+	if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC &&
+	    sctp_style(sk, UDP)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	params.assoc_value = asoc ? asoc->pf_expose
+				  : sctp_sk(sk)->pf_expose;
+
+	if (put_user(len, optlen))
+		goto out;
+
+	if (copy_to_user(optval, &params, len))
+		goto out;
+
+	retval = 0;
+
+out:
+	return retval;
+}
+
 static int sctp_getsockopt(struct sock *sk, int level, int optname,
 			   char __user *optval, int __user *optlen)
 {
@@ -8121,6 +8197,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_ECN_SUPPORTED:
 		retval = sctp_getsockopt_ecn_supported(sk, len, optval, optlen);
 		break;
+	case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE:
+		retval = sctp_getsockopt_pf_expose(sk, len, optval, optlen);
+		break;
 	default:
 		retval = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From 34515e94c92c3f593cd696abca8609246cbd75e6 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Nov 2019 13:20:35 +0800
Subject: sctp: add support for Primary Path Switchover

This is a new feature defined in section 5 of rfc7829: "Primary Path
Switchover". By introducing a new tunable parameter:

  Primary.Switchover.Max.Retrans (PSMR)

The primary path will be changed to another active path when the path
error counter on the old primary path exceeds PSMR, so that "the SCTP
sender is allowed to continue data transmission on a new working path
even when the old primary destination address becomes active again".

This patch is to add this tunable parameter, 'ps_retrans' per netns,
sock, asoc and transport. It also allows a user to change ps_retrans
per netns by sysctl, and ps_retrans per sock/asoc/transport will be
initialized with it.

The check will be done in sctp_do_8_2_transport_strike() when this
feature is enabled.

Note this feature is disabled by initializing 'ps_retrans' per netns
as 0xffff by default, and its value can't be less than 'pf_retrans'
when changing by sysctl.

v3->v4:
  - add define SCTP_PS_RETRANS_MAX 0xffff, and use it on extra2 of
    sysctl 'ps_retrans'.
  - add a new entry for ps_retrans on ip-sysctl.txt.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 12 ++++++++++++
 include/net/netns/sctp.h               |  6 ++++++
 include/net/sctp/constants.h           |  2 ++
 include/net/sctp/structs.h             | 11 ++++++++---
 net/sctp/associola.c                   |  3 +++
 net/sctp/protocol.c                    |  3 +++
 net/sctp/sm_sideeffect.c               |  5 +++++
 net/sctp/socket.c                      |  1 +
 net/sctp/sysctl.c                      | 12 +++++++++++-
 9 files changed, 51 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 0b0feb5b6b00..099a55bd1432 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -2195,6 +2195,18 @@ pf_retrans - INTEGER
 
 	Default: 0
 
+ps_retrans - INTEGER
+	Primary.Switchover.Max.Retrans (PSMR), it's a tunable parameter coming
+	from section-5 "Primary Path Switchover" in rfc7829.  The primary path
+	will be changed to another active path when the path error counter on
+	the old primary path exceeds PSMR, so that "the SCTP sender is allowed
+	to continue data transmission on a new working path even when the old
+	primary destination address becomes active again".   Note this feature
+	is disabled by initializing 'ps_retrans' per netns as 0xffff by default,
+	and its value can't be less than 'pf_retrans' when changing by sysctl.
+
+	Default: 0xffff
+
 rto_initial - INTEGER
 	The initial round trip timeout value in milliseconds that will be used
 	in calculating round trip times.  This is the initial time interval
diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 18c3ddae77a3..d8d02e4188d1 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -89,6 +89,12 @@ struct netns_sctp {
 	 */
 	int pf_retrans;
 
+	/* Primary.Switchover.Max.Retrans sysctl value
+	 * taken from:
+	 * https://tools.ietf.org/html/rfc7829
+	 */
+	int ps_retrans;
+
 	/*
 	 * Disable Potentially-Failed feature, the feature is enabled by default
 	 * pf_enable	-  0  : disable pf
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index e88b77a34cb1..15b4d9aec7ff 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -296,6 +296,8 @@ enum {
 };
 #define SCTP_PF_EXPOSE_MAX	SCTP_PF_EXPOSE_ENABLE
 
+#define SCTP_PS_RETRANS_MAX	0xffff
+
 /* These return values describe the success or failure of a number of
  * routines which form the lower interface to SCTP_outqueue.
  */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 9a43738774d7..3cc913f328cd 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -184,7 +184,8 @@ struct sctp_sock {
 	__u32 flowlabel;
 	__u8  dscp;
 
-	int pf_retrans;
+	__u16 pf_retrans;
+	__u16 ps_retrans;
 
 	/* The initial Path MTU to use for new associations. */
 	__u32 pathmtu;
@@ -897,7 +898,9 @@ struct sctp_transport {
 	 * and will be initialized from the assocs value.  This can be changed
 	 * using the SCTP_PEER_ADDR_THLDS socket option
 	 */
-	int pf_retrans;
+	__u16 pf_retrans;
+	/* Used for primary path switchover. */
+	__u16 ps_retrans;
 	/* PMTU	      : The current known path MTU.  */
 	__u32 pathmtu;
 
@@ -1773,7 +1776,9 @@ struct sctp_association {
 	 * and will be initialized from the assocs value.  This can be
 	 * changed using the SCTP_PEER_ADDR_THLDS socket option
 	 */
-	int pf_retrans;
+	__u16 pf_retrans;
+	/* Used for primary path switchover. */
+	__u16 ps_retrans;
 
 	/* Maximum number of times the endpoint will retransmit INIT  */
 	__u16 max_init_attempts;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index ad7a74e942d3..8f8d18abd013 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -86,6 +86,7 @@ static struct sctp_association *sctp_association_init(
 	 */
 	asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
 	asoc->pf_retrans  = sp->pf_retrans;
+	asoc->ps_retrans  = sp->ps_retrans;
 	asoc->pf_expose   = sp->pf_expose;
 
 	asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
@@ -628,6 +629,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 
 	/* And the partial failure retrans threshold */
 	peer->pf_retrans = asoc->pf_retrans;
+	/* And the primary path switchover retrans threshold */
+	peer->ps_retrans = asoc->ps_retrans;
 
 	/* Initialize the peer's SACK delay timeout based on the
 	 * association configured value.
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f86be7bf0972..fbbf19128c2d 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1217,6 +1217,9 @@ static int __net_init sctp_defaults_init(struct net *net)
 	/* Max.Burst		    - 4 */
 	net->sctp.max_burst			= SCTP_DEFAULT_MAX_BURST;
 
+	/* Disable of Primary Path Switchover by default */
+	net->sctp.ps_retrans = SCTP_PS_RETRANS_MAX;
+
 	/* Enable pf state by default */
 	net->sctp.pf_enable = 1;
 
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index e52b2128e43b..acd737d4c0e0 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -567,6 +567,11 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands,
 					     SCTP_FAILED_THRESHOLD);
 	}
 
+	if (transport->error_count > transport->ps_retrans &&
+	    asoc->peer.primary_path == transport &&
+	    asoc->peer.active_path != transport)
+		sctp_assoc_set_primary(asoc, asoc->peer.active_path);
+
 	/* E2) For the destination address for which the timer
 	 * expires, set RTO <- RTO * 2 ("back off the timer").  The
 	 * maximum value discussed in rule C7 above (RTO.max) may be
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 74c4e62ac741..64452ee410da 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5078,6 +5078,7 @@ static int sctp_init_sock(struct sock *sk)
 	sp->hbinterval  = net->sctp.hb_interval;
 	sp->pathmaxrxt  = net->sctp.max_retrans_path;
 	sp->pf_retrans  = net->sctp.pf_retrans;
+	sp->ps_retrans  = net->sctp.ps_retrans;
 	sp->pf_expose   = net->sctp.pf_expose;
 	sp->pathmtu     = 0; /* allow default discovery */
 	sp->sackdelay   = net->sctp.sack_timeout;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 5d1ad44a29d1..4740aa70e652 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -35,6 +35,7 @@ static int rto_beta_min = 0;
 static int rto_alpha_max = 1000;
 static int rto_beta_max = 1000;
 static int pf_expose_max = SCTP_PF_EXPOSE_MAX;
+static int ps_retrans_max = SCTP_PS_RETRANS_MAX;
 
 static unsigned long max_autoclose_min = 0;
 static unsigned long max_autoclose_max =
@@ -213,7 +214,16 @@ static struct ctl_table sctp_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_INT_MAX,
+		.extra2		= &init_net.sctp.ps_retrans,
+	},
+	{
+		.procname	= "ps_retrans",
+		.data		= &init_net.sctp.ps_retrans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &init_net.sctp.pf_retrans,
+		.extra2		= &ps_retrans_max,
 	},
 	{
 		.procname	= "sndbuf_policy",
-- 
cgit v1.2.3-59-g8ed1b


From d467ac0a38551a5904878b1f5a2fe20a040c0e11 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 8 Nov 2019 13:20:36 +0800
Subject: sctp: add SCTP_PEER_ADDR_THLDS_V2 sockopt

Section 7.2 of rfc7829: "Peer Address Thresholds (SCTP_PEER_ADDR_THLDS)
Socket Option" extends 'struct sctp_paddrthlds' with 'spt_pathcpthld'
added to allow a user to change ps_retrans per sock/asoc/transport, as
other 2 paddrthlds: pf_retrans, pathmaxrxt.

Note: to not break the user's program, here to support pf_retrans dump
and setting by adding a new sockopt SCTP_PEER_ADDR_THLDS_V2, and a new
structure sctp_paddrthlds_v2 instead of extending sctp_paddrthlds.

Also, when setting ps_retrans, the value is not allowed to be greater
than pf_retrans.

v1->v2:
  - use SCTP_PEER_ADDR_THLDS_V2 to set/get pf_retrans instead,
    as Marcelo and David Laight suggested.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sctp.h | 10 +++++++++
 net/sctp/socket.c         | 54 +++++++++++++++++++++++++++++++++++------------
 2 files changed, 50 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index a190e4a7f546..28ad40d9acba 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -105,6 +105,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_DEFAULT_SNDINFO	34
 #define SCTP_AUTH_DEACTIVATE_KEY	35
 #define SCTP_REUSE_PORT		36
+#define SCTP_PEER_ADDR_THLDS_V2	37
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
@@ -1087,6 +1088,15 @@ struct sctp_paddrthlds {
 	__u16 spt_pathpfthld;
 };
 
+/* Use a new structure with spt_pathcpthld for back compatibility */
+struct sctp_paddrthlds_v2 {
+	sctp_assoc_t spt_assoc_id;
+	struct sockaddr_storage spt_address;
+	__u16 spt_pathmaxrxt;
+	__u16 spt_pathpfthld;
+	__u16 spt_pathcpthld;
+};
+
 /*
  * Socket Option for Getting the Association/Stream-Specific PR-SCTP Status
  */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 64452ee410da..83e4ca1fabda 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3943,18 +3943,22 @@ static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
  */
 static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
 					    char __user *optval,
-					    unsigned int optlen)
+					    unsigned int optlen, bool v2)
 {
-	struct sctp_paddrthlds val;
+	struct sctp_paddrthlds_v2 val;
 	struct sctp_transport *trans;
 	struct sctp_association *asoc;
+	int len;
 
-	if (optlen < sizeof(struct sctp_paddrthlds))
+	len = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds);
+	if (optlen < len)
 		return -EINVAL;
-	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
-			   sizeof(struct sctp_paddrthlds)))
+	if (copy_from_user(&val, optval, len))
 		return -EFAULT;
 
+	if (v2 && val.spt_pathpfthld > val.spt_pathcpthld)
+		return -EINVAL;
+
 	if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
 		trans = sctp_addr_id2transport(sk, &val.spt_address,
 					       val.spt_assoc_id);
@@ -3963,6 +3967,8 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
 
 		if (val.spt_pathmaxrxt)
 			trans->pathmaxrxt = val.spt_pathmaxrxt;
+		if (v2)
+			trans->ps_retrans = val.spt_pathcpthld;
 		trans->pf_retrans = val.spt_pathpfthld;
 
 		return 0;
@@ -3978,17 +3984,23 @@ static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
 				    transports) {
 			if (val.spt_pathmaxrxt)
 				trans->pathmaxrxt = val.spt_pathmaxrxt;
+			if (v2)
+				trans->ps_retrans = val.spt_pathcpthld;
 			trans->pf_retrans = val.spt_pathpfthld;
 		}
 
 		if (val.spt_pathmaxrxt)
 			asoc->pathmaxrxt = val.spt_pathmaxrxt;
+		if (v2)
+			asoc->ps_retrans = val.spt_pathcpthld;
 		asoc->pf_retrans = val.spt_pathpfthld;
 	} else {
 		struct sctp_sock *sp = sctp_sk(sk);
 
 		if (val.spt_pathmaxrxt)
 			sp->pathmaxrxt = val.spt_pathmaxrxt;
+		if (v2)
+			sp->ps_retrans = val.spt_pathcpthld;
 		sp->pf_retrans = val.spt_pathpfthld;
 	}
 
@@ -4778,7 +4790,12 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
 		break;
 	case SCTP_PEER_ADDR_THLDS:
-		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen,
+							  false);
+		break;
+	case SCTP_PEER_ADDR_THLDS_V2:
+		retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen,
+							  true);
 		break;
 	case SCTP_RECVRCVINFO:
 		retval = sctp_setsockopt_recvrcvinfo(sk, optval, optlen);
@@ -7217,18 +7234,19 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
  * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
  */
 static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
-					    char __user *optval,
-					    int len,
-					    int __user *optlen)
+					    char __user *optval, int len,
+					    int __user *optlen, bool v2)
 {
-	struct sctp_paddrthlds val;
+	struct sctp_paddrthlds_v2 val;
 	struct sctp_transport *trans;
 	struct sctp_association *asoc;
+	int min;
 
-	if (len < sizeof(struct sctp_paddrthlds))
+	min = v2 ? sizeof(val) : sizeof(struct sctp_paddrthlds);
+	if (len < min)
 		return -EINVAL;
-	len = sizeof(struct sctp_paddrthlds);
-	if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, len))
+	len = min;
+	if (copy_from_user(&val, optval, len))
 		return -EFAULT;
 
 	if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
@@ -7239,6 +7257,7 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
 
 		val.spt_pathmaxrxt = trans->pathmaxrxt;
 		val.spt_pathpfthld = trans->pf_retrans;
+		val.spt_pathcpthld = trans->ps_retrans;
 
 		goto out;
 	}
@@ -7251,11 +7270,13 @@ static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
 	if (asoc) {
 		val.spt_pathpfthld = asoc->pf_retrans;
 		val.spt_pathmaxrxt = asoc->pathmaxrxt;
+		val.spt_pathcpthld = asoc->ps_retrans;
 	} else {
 		struct sctp_sock *sp = sctp_sk(sk);
 
 		val.spt_pathpfthld = sp->pf_retrans;
 		val.spt_pathmaxrxt = sp->pathmaxrxt;
+		val.spt_pathcpthld = sp->ps_retrans;
 	}
 
 out:
@@ -8135,7 +8156,12 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 		retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
 		break;
 	case SCTP_PEER_ADDR_THLDS:
-		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen);
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len,
+							  optlen, false);
+		break;
+	case SCTP_PEER_ADDR_THLDS_V2:
+		retval = sctp_getsockopt_paddr_thresholds(sk, optval, len,
+							  optlen, true);
 		break;
 	case SCTP_GET_ASSOC_STATS:
 		retval = sctp_getsockopt_assoc_stats(sk, len, optval, optlen);
-- 
cgit v1.2.3-59-g8ed1b


From 727b3668b730634228fc65c336c2a7a080e02885 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 8 Nov 2019 17:39:29 +0000
Subject: net: sfp: rework upstream interface

The current upstream interface is an all-or-nothing, which is
sub-optimal for future changes, as it doesn't allow the upstream driver
to prepare for the SFP module becoming available, as it is at boot.

Switch to a find-sfp-bus, add-upstream, del-upstream, put-sfp-bus
interface structure instead, which allows the upstream driver to
prepare for a module being available as soon as add-upstream is called.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 10 +++---
 drivers/net/phy/sfp-bus.c | 92 +++++++++++++++++++++++++++++++++--------------
 include/linux/sfp.h       | 25 ++++++++-----
 3 files changed, 88 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index f16d9e92a81a..e3fbc8e93317 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -567,7 +567,7 @@ static int phylink_register_sfp(struct phylink *pl,
 	struct sfp_bus *bus;
 	int ret;
 
-	bus = sfp_register_upstream_node(fwnode, pl, &sfp_phylink_ops);
+	bus = sfp_bus_find_fwnode(fwnode);
 	if (IS_ERR(bus)) {
 		ret = PTR_ERR(bus);
 		phylink_err(pl, "unable to attach SFP bus: %d\n", ret);
@@ -576,7 +576,10 @@ static int phylink_register_sfp(struct phylink *pl,
 
 	pl->sfp_bus = bus;
 
-	return 0;
+	ret = sfp_bus_add_upstream(bus, pl, &sfp_phylink_ops);
+	sfp_bus_put(bus);
+
+	return ret;
 }
 
 /**
@@ -670,8 +673,7 @@ EXPORT_SYMBOL_GPL(phylink_create);
  */
 void phylink_destroy(struct phylink *pl)
 {
-	if (pl->sfp_bus)
-		sfp_unregister_upstream(pl->sfp_bus);
+	sfp_bus_del_upstream(pl->sfp_bus);
 	if (pl->link_gpio)
 		gpiod_put(pl->link_gpio);
 
diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index d037aab6a71d..715d45214e18 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -329,10 +329,19 @@ static void sfp_bus_release(struct kref *kref)
 	kfree(bus);
 }
 
-static void sfp_bus_put(struct sfp_bus *bus)
+/**
+ * sfp_bus_put() - put a reference on the &struct sfp_bus
+ * bus: the &struct sfp_bus found via sfp_bus_find_fwnode()
+ *
+ * Put a reference on the &struct sfp_bus and free the underlying structure
+ * if this was the last reference.
+ */
+void sfp_bus_put(struct sfp_bus *bus)
 {
-	kref_put_mutex(&bus->kref, sfp_bus_release, &sfp_mutex);
+	if (bus)
+		kref_put_mutex(&bus->kref, sfp_bus_release, &sfp_mutex);
 }
+EXPORT_SYMBOL_GPL(sfp_bus_put);
 
 static int sfp_register_bus(struct sfp_bus *bus)
 {
@@ -348,11 +357,11 @@ static int sfp_register_bus(struct sfp_bus *bus)
 				return ret;
 		}
 	}
+	bus->registered = true;
 	bus->socket_ops->attach(bus->sfp);
 	if (bus->started)
 		bus->socket_ops->start(bus->sfp);
 	bus->upstream_ops->attach(bus->upstream, bus);
-	bus->registered = true;
 	return 0;
 }
 
@@ -446,13 +455,12 @@ static void sfp_upstream_clear(struct sfp_bus *bus)
 }
 
 /**
- * sfp_register_upstream_node() - parse and register the neighbouring device
+ * sfp_bus_find_fwnode() - parse and locate the SFP bus from fwnode
  * @fwnode: firmware node for the parent device (MAC or PHY)
- * @upstream: the upstream private data
- * @ops: the upstream's &struct sfp_upstream_ops
  *
- * Parse the parent device's firmware node for a SFP bus, and register the
- * SFP bus using sfp_register_upstream().
+ * Parse the parent device's firmware node for a SFP bus, and locate
+ * the sfp_bus structure, incrementing its reference count.  This must
+ * be put via sfp_bus_put() when done.
  *
  * Returns: on success, a pointer to the sfp_bus structure,
  *	    %NULL if no SFP is specified,
@@ -462,9 +470,7 @@ static void sfp_upstream_clear(struct sfp_bus *bus)
  * 	        %-ENOMEM if we failed to allocate the bus.
  *		an error from the upstream's connect_phy() method.
  */
-struct sfp_bus *sfp_register_upstream_node(struct fwnode_handle *fwnode,
-					   void *upstream,
-					   const struct sfp_upstream_ops *ops)
+struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode)
 {
 	struct fwnode_reference_args ref;
 	struct sfp_bus *bus;
@@ -482,7 +488,39 @@ struct sfp_bus *sfp_register_upstream_node(struct fwnode_handle *fwnode,
 	if (!bus)
 		return ERR_PTR(-ENOMEM);
 
+	return bus;
+}
+EXPORT_SYMBOL_GPL(sfp_bus_find_fwnode);
+
+/**
+ * sfp_bus_add_upstream() - parse and register the neighbouring device
+ * @bus: the &struct sfp_bus found via sfp_bus_find_fwnode()
+ * @upstream: the upstream private data
+ * @ops: the upstream's &struct sfp_upstream_ops
+ *
+ * Add upstream driver for the SFP bus, and if the bus is complete, register
+ * the SFP bus using sfp_register_upstream().  This takes a reference on the
+ * bus, so it is safe to put the bus after this call.
+ *
+ * Returns: on success, a pointer to the sfp_bus structure,
+ *	    %NULL if no SFP is specified,
+ * 	    on failure, an error pointer value:
+ * 		corresponding to the errors detailed for
+ * 		fwnode_property_get_reference_args().
+ * 	        %-ENOMEM if we failed to allocate the bus.
+ *		an error from the upstream's connect_phy() method.
+ */
+int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
+			 const struct sfp_upstream_ops *ops)
+{
+	int ret;
+
+	/* If no bus, return success */
+	if (!bus)
+		return 0;
+
 	rtnl_lock();
+	kref_get(&bus->kref);
 	bus->upstream_ops = ops;
 	bus->upstream = upstream;
 
@@ -495,33 +533,33 @@ struct sfp_bus *sfp_register_upstream_node(struct fwnode_handle *fwnode,
 	}
 	rtnl_unlock();
 
-	if (ret) {
+	if (ret)
 		sfp_bus_put(bus);
-		bus = ERR_PTR(ret);
-	}
 
-	return bus;
+	return ret;
 }
-EXPORT_SYMBOL_GPL(sfp_register_upstream_node);
+EXPORT_SYMBOL_GPL(sfp_bus_add_upstream);
 
 /**
- * sfp_unregister_upstream() - Unregister sfp bus
+ * sfp_bus_del_upstream() - Delete a sfp bus
  * @bus: a pointer to the &struct sfp_bus structure for the sfp module
  *
- * Unregister a previously registered upstream connection for the SFP
- * module. @bus is returned from sfp_register_upstream().
+ * Delete a previously registered upstream connection for the SFP
+ * module. @bus should have been added by sfp_bus_add_upstream().
  */
-void sfp_unregister_upstream(struct sfp_bus *bus)
+void sfp_bus_del_upstream(struct sfp_bus *bus)
 {
-	rtnl_lock();
-	if (bus->sfp)
-		sfp_unregister_bus(bus);
-	sfp_upstream_clear(bus);
-	rtnl_unlock();
+	if (bus) {
+		rtnl_lock();
+		if (bus->sfp)
+			sfp_unregister_bus(bus);
+		sfp_upstream_clear(bus);
+		rtnl_unlock();
 
-	sfp_bus_put(bus);
+		sfp_bus_put(bus);
+	}
 }
-EXPORT_SYMBOL_GPL(sfp_unregister_upstream);
+EXPORT_SYMBOL_GPL(sfp_bus_del_upstream);
 
 /* Socket driver entry points */
 int sfp_add_phy(struct sfp_bus *bus, struct phy_device *phydev)
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 355a08a76fd4..c8464de7cff5 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -508,10 +508,11 @@ int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee,
 			  u8 *data);
 void sfp_upstream_start(struct sfp_bus *bus);
 void sfp_upstream_stop(struct sfp_bus *bus);
-struct sfp_bus *sfp_register_upstream_node(struct fwnode_handle *fwnode,
-					   void *upstream,
-					   const struct sfp_upstream_ops *ops);
-void sfp_unregister_upstream(struct sfp_bus *bus);
+void sfp_bus_put(struct sfp_bus *bus);
+struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode);
+int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
+			 const struct sfp_upstream_ops *ops);
+void sfp_bus_del_upstream(struct sfp_bus *bus);
 #else
 static inline int sfp_parse_port(struct sfp_bus *bus,
 				 const struct sfp_eeprom_id *id,
@@ -553,14 +554,22 @@ static inline void sfp_upstream_stop(struct sfp_bus *bus)
 {
 }
 
-static inline struct sfp_bus *sfp_register_upstream_node(
-	struct fwnode_handle *fwnode, void *upstream,
-	const struct sfp_upstream_ops *ops)
+static inline void sfp_bus_put(struct sfp_bus *bus)
+{
+}
+
+static inline struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode)
 {
 	return NULL;
 }
 
-static inline void sfp_unregister_upstream(struct sfp_bus *bus)
+static int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
+				const struct sfp_upstream_ops *ops)
+{
+	return 0;
+}
+
+static inline void sfp_bus_del_upstream(struct sfp_bus *bus)
 {
 }
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 6c7295e13ffd5623b02f1adc1442f1d8a3d52424 Mon Sep 17 00:00:00 2001
From: Michael Guralnik <michaelgur@mellanox.com>
Date: Fri, 8 Nov 2019 23:45:20 +0000
Subject: devlink: Add new "enable_roce" generic device param

New device parameter to enable/disable handling of RoCE traffic in the
device.

Signed-off-by: Michael Guralnik <michaelgur@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 Documentation/networking/devlink-params.txt | 4 ++++
 include/net/devlink.h                       | 4 ++++
 net/core/devlink.c                          | 5 +++++
 3 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/devlink-params.txt b/Documentation/networking/devlink-params.txt
index ddba3e9b55b1..04e234e9acc9 100644
--- a/Documentation/networking/devlink-params.txt
+++ b/Documentation/networking/devlink-params.txt
@@ -65,3 +65,7 @@ reset_dev_on_drv_probe	[DEVICE, GENERIC]
 			  Reset only if device firmware can be found in the
 			  filesystem.
 			Type: u8
+
+enable_roce		[DEVICE, GENERIC]
+			Enable handling of RoCE traffic in the device.
+			Type: Boolean
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 23e4b65ec9df..39fb4d957838 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -400,6 +400,7 @@ enum devlink_param_generic_id {
 	DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
 	DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY,
 	DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE,
+	DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
 
 	/* add new param generic ids above here*/
 	__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -434,6 +435,9 @@ enum devlink_param_generic_id {
 	"reset_dev_on_drv_probe"
 #define DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE DEVLINK_PARAM_TYPE_U8
 
+#define DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME "enable_roce"
+#define DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE DEVLINK_PARAM_TYPE_BOOL
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate)	\
 {									\
 	.id = DEVLINK_PARAM_GENERIC_ID_##_id,				\
diff --git a/net/core/devlink.c b/net/core/devlink.c
index f80151eeaf51..0fbcd44aa64f 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2884,6 +2884,11 @@ static const struct devlink_param devlink_param_generic[] = {
 		.name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME,
 		.type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE,
 	},
+	{
+		.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
+		.name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME,
+		.type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE,
+	},
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
cgit v1.2.3-59-g8ed1b


From cc9defcbb8fae52810f7795b039223edae51ef95 Mon Sep 17 00:00:00 2001
From: Michael Guralnik <michaelgur@mellanox.com>
Date: Fri, 8 Nov 2019 23:45:24 +0000
Subject: net/mlx5: Handle "enable_roce" devlink param

Register "enable_roce" param, default value is RoCE enabled.
Current configuration is stored on mlx5_core_dev and exposed to user
through the cmode runtime devlink param.
Changing configuration requires changing the cmode driverinit devlink
param and calling devlink reload.

Signed-off-by: Michael Guralnik <michaelgur@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../networking/device_drivers/mellanox/mlx5.rst    | 21 +++++++++++++++++++++
 Documentation/networking/devlink-params-mlx5.txt   |  5 +++++
 drivers/net/ethernet/mellanox/mlx5/core/devlink.c  | 22 ++++++++++++++++++++++
 include/linux/mlx5/driver.h                        | 11 +++++++++++
 4 files changed, 59 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/device_drivers/mellanox/mlx5.rst b/Documentation/networking/device_drivers/mellanox/mlx5.rst
index d071c6b49e1f..7599dceba9f1 100644
--- a/Documentation/networking/device_drivers/mellanox/mlx5.rst
+++ b/Documentation/networking/device_drivers/mellanox/mlx5.rst
@@ -154,6 +154,27 @@ User command examples:
       values:
          cmode runtime value smfs
 
+enable_roce: RoCE enablement state
+----------------------------------
+RoCE enablement state controls driver support for RoCE traffic.
+When RoCE is disabled, there is no gid table, only raw ethernet QPs are supported and traffic on the well known UDP RoCE port is handled as raw ethernet traffic.
+
+To change RoCE enablement state a user must change the driverinit cmode value and run devlink reload.
+
+User command examples:
+
+- Disable RoCE::
+
+    $ devlink dev param set pci/0000:06:00.0 name enable_roce value false cmode driverinit
+    $ devlink dev reload pci/0000:06:00.0
+
+- Read RoCE enablement state::
+
+    $ devlink dev param show pci/0000:06:00.0 name enable_roce
+      pci/0000:06:00.0:
+      name enable_roce type generic
+      values:
+         cmode driverinit value true
 
 Devlink health reporters
 ========================
diff --git a/Documentation/networking/devlink-params-mlx5.txt b/Documentation/networking/devlink-params-mlx5.txt
index 8c0b82d655dc..5071467118bd 100644
--- a/Documentation/networking/devlink-params-mlx5.txt
+++ b/Documentation/networking/devlink-params-mlx5.txt
@@ -10,3 +10,8 @@ flow_steering_mode	[DEVICE, DRIVER-SPECIFIC]
 			without firmware intervention.
 			Type: String
 			Configuration mode: runtime
+
+enable_roce		[DEVICE, GENERIC]
+			Enable handling of RoCE traffic in the device.
+			Defaultly enabled.
+			Configuration mode: driverinit
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 381925c90d94..b2c26388edb1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -177,12 +177,29 @@ enum mlx5_devlink_param_id {
 	MLX5_DEVLINK_PARAM_FLOW_STEERING_MODE,
 };
 
+static int mlx5_devlink_enable_roce_validate(struct devlink *devlink, u32 id,
+					     union devlink_param_value val,
+					     struct netlink_ext_ack *extack)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	bool new_state = val.vbool;
+
+	if (new_state && !MLX5_CAP_GEN(dev, roce)) {
+		NL_SET_ERR_MSG_MOD(extack, "Device doesn't support RoCE");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
 static const struct devlink_param mlx5_devlink_params[] = {
 	DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_FLOW_STEERING_MODE,
 			     "flow_steering_mode", DEVLINK_PARAM_TYPE_STRING,
 			     BIT(DEVLINK_PARAM_CMODE_RUNTIME),
 			     mlx5_devlink_fs_mode_get, mlx5_devlink_fs_mode_set,
 			     mlx5_devlink_fs_mode_validate),
+	DEVLINK_PARAM_GENERIC(ENABLE_ROCE, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
+			      NULL, NULL, mlx5_devlink_enable_roce_validate),
 };
 
 static void mlx5_devlink_set_params_init_values(struct devlink *devlink)
@@ -197,6 +214,11 @@ static void mlx5_devlink_set_params_init_values(struct devlink *devlink)
 	devlink_param_driverinit_value_set(devlink,
 					   MLX5_DEVLINK_PARAM_FLOW_STEERING_MODE,
 					   value);
+
+	value.vbool = MLX5_CAP_GEN(dev, roce);
+	devlink_param_driverinit_value_set(devlink,
+					   DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
+					   value);
 }
 
 int mlx5_devlink_register(struct devlink *devlink, struct device *dev)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7b4801e96feb..1884513aac90 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1191,4 +1191,15 @@ enum {
 	MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32,
 };
 
+static inline bool mlx5_is_roce_enabled(struct mlx5_core_dev *dev)
+{
+	struct devlink *devlink = priv_to_devlink(dev);
+	union devlink_param_value val;
+
+	devlink_param_driverinit_value_get(devlink,
+					   DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
+					   &val);
+	return val.vbool;
+}
+
 #endif /* MLX5_DRIVER_H */
-- 
cgit v1.2.3-59-g8ed1b


From 50ec88120ea16cf8b9aabf8422c364166ce3ee17 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 8 Oct 2019 19:20:39 +0300
Subject: can: mcp251x: get rid of legacy platform data

Instead of using legacy platform data, switch to use device properties.
For clock frequency we are using well established clock-frequency property.

Users, two for now, are also converted here.

Cc: Daniel Mack <daniel@zonque.org>
Cc: Haojian Zhuang <haojian.zhuang@gmail.com>
Cc: Robert Jarzmik <robert.jarzmik@free.fr>
Cc: Russell King <linux@armlinux.org.uk>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 arch/arm/mach-pxa/icontrol.c         |  9 +++++----
 arch/arm/mach-pxa/zeus.c             |  9 +++++----
 drivers/net/can/spi/mcp251x.c        |  9 ++++-----
 include/linux/can/platform/mcp251x.h | 22 ----------------------
 4 files changed, 14 insertions(+), 35 deletions(-)
 delete mode 100644 include/linux/can/platform/mcp251x.h

(limited to 'include')

diff --git a/arch/arm/mach-pxa/icontrol.c b/arch/arm/mach-pxa/icontrol.c
index 865b10344ea2..0474a4b1394d 100644
--- a/arch/arm/mach-pxa/icontrol.c
+++ b/arch/arm/mach-pxa/icontrol.c
@@ -12,6 +12,7 @@
 
 #include <linux/irq.h>
 #include <linux/platform_device.h>
+#include <linux/property.h>
 #include <linux/gpio.h>
 
 #include <asm/mach-types.h>
@@ -22,7 +23,6 @@
 
 #include <linux/spi/spi.h>
 #include <linux/spi/pxa2xx_spi.h>
-#include <linux/can/platform/mcp251x.h>
 #include <linux/regulator/machine.h>
 
 #include "generic.h"
@@ -69,8 +69,9 @@ static struct pxa2xx_spi_chip mcp251x_chip_info4 = {
 	.gpio_cs        = ICONTROL_MCP251x_nCS4
 };
 
-static struct mcp251x_platform_data mcp251x_info = {
-	.oscillator_frequency = 16E6,
+static const struct property_entry mcp251x_properties[] = {
+	PROPERTY_ENTRY_U32("clock-frequency", 16000000),
+	{}
 };
 
 static struct spi_board_info mcp251x_board_info[] = {
@@ -79,7 +80,7 @@ static struct spi_board_info mcp251x_board_info[] = {
 		.max_speed_hz    = 6500000,
 		.bus_num         = 3,
 		.chip_select     = 0,
-		.platform_data   = &mcp251x_info,
+		.properties      = mcp251x_properties,
 		.controller_data = &mcp251x_chip_info1,
 		.irq             = PXA_GPIO_TO_IRQ(ICONTROL_MCP251x_nIRQ1)
 	},
diff --git a/arch/arm/mach-pxa/zeus.c b/arch/arm/mach-pxa/zeus.c
index da113c8eefbf..b27fc7ac9cea 100644
--- a/arch/arm/mach-pxa/zeus.c
+++ b/arch/arm/mach-pxa/zeus.c
@@ -13,6 +13,7 @@
 #include <linux/leds.h>
 #include <linux/irq.h>
 #include <linux/pm.h>
+#include <linux/property.h>
 #include <linux/gpio.h>
 #include <linux/gpio/machine.h>
 #include <linux/serial_8250.h>
@@ -27,7 +28,6 @@
 #include <linux/platform_data/i2c-pxa.h>
 #include <linux/platform_data/pca953x.h>
 #include <linux/apm-emulation.h>
-#include <linux/can/platform/mcp251x.h>
 #include <linux/regulator/fixed.h>
 #include <linux/regulator/machine.h>
 
@@ -428,14 +428,15 @@ static struct gpiod_lookup_table can_regulator_gpiod_table = {
 	},
 };
 
-static struct mcp251x_platform_data zeus_mcp2515_pdata = {
-	.oscillator_frequency	= 16*1000*1000,
+static const struct property_entry mcp251x_properties[] = {
+	PROPERTY_ENTRY_U32("clock-frequency", 16000000),
+	{}
 };
 
 static struct spi_board_info zeus_spi_board_info[] = {
 	[0] = {
 		.modalias	= "mcp2515",
-		.platform_data	= &zeus_mcp2515_pdata,
+		.properties	= mcp251x_properties,
 		.irq		= PXA_GPIO_TO_IRQ(ZEUS_CAN_GPIO),
 		.max_speed_hz	= 1*1000*1000,
 		.bus_num	= 3,
diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c
index bb20a9b75cc6..ee2e97da4e1d 100644
--- a/drivers/net/can/spi/mcp251x.c
+++ b/drivers/net/can/spi/mcp251x.c
@@ -22,7 +22,6 @@
 #include <linux/can/core.h>
 #include <linux/can/dev.h>
 #include <linux/can/led.h>
-#include <linux/can/platform/mcp251x.h>
 #include <linux/clk.h>
 #include <linux/completion.h>
 #include <linux/delay.h>
@@ -986,19 +985,19 @@ MODULE_DEVICE_TABLE(spi, mcp251x_id_table);
 static int mcp251x_can_probe(struct spi_device *spi)
 {
 	const void *match = device_get_match_data(&spi->dev);
-	struct mcp251x_platform_data *pdata = dev_get_platdata(&spi->dev);
 	struct net_device *net;
 	struct mcp251x_priv *priv;
 	struct clk *clk;
-	int freq, ret;
+	u32 freq;
+	int ret;
 
 	clk = devm_clk_get_optional(&spi->dev, NULL);
 	if (IS_ERR(clk))
 		return PTR_ERR(clk);
 
 	freq = clk_get_rate(clk);
-	if (freq == 0 && pdata)
-		freq = pdata->oscillator_frequency;
+	if (freq == 0)
+		device_property_read_u32(&spi->dev, "clock-frequency", &freq);
 
 	/* Sanity check */
 	if (freq < 1000000 || freq > 25000000)
diff --git a/include/linux/can/platform/mcp251x.h b/include/linux/can/platform/mcp251x.h
deleted file mode 100644
index 9e5ac27fb6c1..000000000000
--- a/include/linux/can/platform/mcp251x.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _CAN_PLATFORM_MCP251X_H
-#define _CAN_PLATFORM_MCP251X_H
-
-/*
- *
- * CAN bus driver for Microchip 251x CAN Controller with SPI Interface
- *
- */
-
-#include <linux/spi/spi.h>
-
-/*
- * struct mcp251x_platform_data - MCP251X SPI CAN controller platform data
- * @oscillator_frequency:       - oscillator frequency in Hz
- */
-
-struct mcp251x_platform_data {
-	unsigned long oscillator_frequency;
-};
-
-#endif /* !_CAN_PLATFORM_MCP251X_H */
-- 
cgit v1.2.3-59-g8ed1b


From 61d2350615c2c42f7af65d9a575f5dbf9738a10e Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Mon, 7 Oct 2019 13:36:58 +0200
Subject: can: rx-offload: can_rx_offload_reset(): remove no-op function

This patch removes the function can_rx_offload_reset(), as it does
nothing. If we ever need this function, add it back again.

Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/rx-offload.c   | 7 -------
 include/linux/can/rx-offload.h | 1 -
 2 files changed, 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/rx-offload.c b/drivers/net/can/rx-offload.c
index 412a8cec5d18..8d120a3750fa 100644
--- a/drivers/net/can/rx-offload.c
+++ b/drivers/net/can/rx-offload.c
@@ -348,7 +348,6 @@ static int can_rx_offload_init_queue(struct net_device *dev,
 	offload->skb_queue_len_max *= 4;
 	skb_queue_head_init(&offload->skb_queue);
 
-	can_rx_offload_reset(offload);
 	netif_napi_add(dev, &offload->napi, can_rx_offload_napi_poll, weight);
 
 	dev_dbg(dev->dev.parent, "%s: skb_queue_len_max=%d\n",
@@ -390,7 +389,6 @@ EXPORT_SYMBOL_GPL(can_rx_offload_add_fifo);
 
 void can_rx_offload_enable(struct can_rx_offload *offload)
 {
-	can_rx_offload_reset(offload);
 	napi_enable(&offload->napi);
 }
 EXPORT_SYMBOL_GPL(can_rx_offload_enable);
@@ -401,8 +399,3 @@ void can_rx_offload_del(struct can_rx_offload *offload)
 	skb_queue_purge(&offload->skb_queue);
 }
 EXPORT_SYMBOL_GPL(can_rx_offload_del);
-
-void can_rx_offload_reset(struct can_rx_offload *offload)
-{
-}
-EXPORT_SYMBOL_GPL(can_rx_offload_reset);
diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h
index 01219f2902bf..fc75e9a7ad2f 100644
--- a/include/linux/can/rx-offload.h
+++ b/include/linux/can/rx-offload.h
@@ -44,7 +44,6 @@ unsigned int can_rx_offload_get_echo_skb(struct can_rx_offload *offload,
 					 unsigned int idx, u32 timestamp);
 int can_rx_offload_queue_tail(struct can_rx_offload *offload,
 			      struct sk_buff *skb);
-void can_rx_offload_reset(struct can_rx_offload *offload);
 void can_rx_offload_del(struct can_rx_offload *offload);
 void can_rx_offload_enable(struct can_rx_offload *offload);
 
-- 
cgit v1.2.3-59-g8ed1b


From 4e9c9484b085dbba60b299182dd490eaeb84d18a Mon Sep 17 00:00:00 2001
From: Joakim Zhang <qiangqing.zhang@nxp.com>
Date: Fri, 12 Jul 2019 08:02:38 +0000
Subject: can: rx-offload: Prepare for CAN FD support

The skbs for classic CAN and CAN FD frames are allocated with seperate
functions: alloc_can_skb() and alloc_canfd_skb().

In order to support CAN FD frames via the rx-offload helper, the driver
itself has to allocate the skb (depending whether it received a classic
CAN or CAN FD frame), as the rx-offload helper cannot know which kind of
CAN frame the driver has received.

This patch moves the allocation of the skb into the struct
can_rx_offload::mailbox_read callbacks of the the flexcan and ti_hecc
driver and adjusts the rx-offload helper accordingly.

Signed-off-by: Joakim Zhang <qiangqing.zhang@nxp.com>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/flexcan.c      | 27 +++++++++++-----
 drivers/net/can/rx-offload.c   | 70 ++++++++++--------------------------------
 drivers/net/can/ti_hecc.c      | 26 ++++++++++++----
 include/linux/can/rx-offload.h |  6 ++--
 4 files changed, 60 insertions(+), 69 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
index 57f9a2f51085..1a8198163b80 100644
--- a/drivers/net/can/flexcan.c
+++ b/drivers/net/can/flexcan.c
@@ -783,16 +783,23 @@ static inline struct flexcan_priv *rx_offload_to_priv(struct can_rx_offload *off
 	return container_of(offload, struct flexcan_priv, offload);
 }
 
-static unsigned int flexcan_mailbox_read(struct can_rx_offload *offload,
-					 struct can_frame *cf,
-					 u32 *timestamp, unsigned int n)
+static struct sk_buff *flexcan_mailbox_read(struct can_rx_offload *offload,
+					    unsigned int n, u32 *timestamp,
+					    bool drop)
 {
 	struct flexcan_priv *priv = rx_offload_to_priv(offload);
 	struct flexcan_regs __iomem *regs = priv->regs;
 	struct flexcan_mb __iomem *mb;
+	struct sk_buff *skb;
+	struct can_frame *cf;
 	u32 reg_ctrl, reg_id, reg_iflag1;
 	int i;
 
+	if (unlikely(drop)) {
+		skb = ERR_PTR(-ENOBUFS);
+		goto mark_as_read;
+	}
+
 	mb = flexcan_get_mb(priv, n);
 
 	if (priv->devtype_data->quirks & FLEXCAN_QUIRK_USE_OFF_TIMESTAMP) {
@@ -806,7 +813,7 @@ static unsigned int flexcan_mailbox_read(struct can_rx_offload *offload,
 		code = reg_ctrl & FLEXCAN_MB_CODE_MASK;
 		if ((code != FLEXCAN_MB_CODE_RX_FULL) &&
 		    (code != FLEXCAN_MB_CODE_RX_OVERRUN))
-			return 0;
+			return NULL;
 
 		if (code == FLEXCAN_MB_CODE_RX_OVERRUN) {
 			/* This MB was overrun, we lost data */
@@ -816,11 +823,17 @@ static unsigned int flexcan_mailbox_read(struct can_rx_offload *offload,
 	} else {
 		reg_iflag1 = priv->read(&regs->iflag1);
 		if (!(reg_iflag1 & FLEXCAN_IFLAG_RX_FIFO_AVAILABLE))
-			return 0;
+			return NULL;
 
 		reg_ctrl = priv->read(&mb->can_ctrl);
 	}
 
+	skb = alloc_can_skb(offload->dev, &cf);
+	if (!skb) {
+		skb = ERR_PTR(-ENOMEM);
+		goto mark_as_read;
+	}
+
 	/* increase timstamp to full 32 bit */
 	*timestamp = reg_ctrl << 16;
 
@@ -839,7 +852,7 @@ static unsigned int flexcan_mailbox_read(struct can_rx_offload *offload,
 		*(__be32 *)(cf->data + i) = data;
 	}
 
-	/* mark as read */
+ mark_as_read:
 	if (priv->devtype_data->quirks & FLEXCAN_QUIRK_USE_OFF_TIMESTAMP) {
 		/* Clear IRQ */
 		if (n < 32)
@@ -856,7 +869,7 @@ static unsigned int flexcan_mailbox_read(struct can_rx_offload *offload,
 	 */
 	priv->read(&regs->timer);
 
-	return 1;
+	return skb;
 }
 
 
diff --git a/drivers/net/can/rx-offload.c b/drivers/net/can/rx-offload.c
index 8d120a3750fa..e8328910a234 100644
--- a/drivers/net/can/rx-offload.c
+++ b/drivers/net/can/rx-offload.c
@@ -139,71 +139,35 @@ static int can_rx_offload_compare(struct sk_buff *a, struct sk_buff *b)
 static struct sk_buff *
 can_rx_offload_offload_one(struct can_rx_offload *offload, unsigned int n)
 {
-	struct sk_buff *skb = NULL, *skb_error = NULL;
+	struct sk_buff *skb;
 	struct can_rx_offload_cb *cb;
-	struct can_frame *cf;
-	int ret;
-
-	if (likely(skb_queue_len(&offload->skb_queue) <
-		   offload->skb_queue_len_max)) {
-		skb = alloc_can_skb(offload->dev, &cf);
-		if (unlikely(!skb))
-			skb_error = ERR_PTR(-ENOMEM);	/* skb alloc failed */
-	} else {
-		skb_error = ERR_PTR(-ENOBUFS);		/* skb_queue is full */
-	}
-
-	/* If queue is full or skb not available, drop by reading into
-	 * overflow buffer.
-	 */
-	if (unlikely(skb_error)) {
-		struct can_frame cf_overflow;
-		u32 timestamp;
-
-		ret = offload->mailbox_read(offload, &cf_overflow,
-					    &timestamp, n);
-
-		/* Mailbox was empty. */
-		if (unlikely(!ret))
-			return NULL;
-
-		/* Mailbox has been read and we're dropping it or
-		 * there was a problem reading the mailbox.
-		 *
-		 * Increment error counters in any case.
-		 */
-		offload->dev->stats.rx_dropped++;
-		offload->dev->stats.rx_fifo_errors++;
-
-		/* There was a problem reading the mailbox, propagate
-		 * error value.
-		 */
-		if (unlikely(ret < 0))
-			return ERR_PTR(ret);
-
-		return skb_error;
-	}
+	bool drop = false;
+	u32 timestamp;
 
-	cb = can_rx_offload_get_cb(skb);
-	ret = offload->mailbox_read(offload, cf, &cb->timestamp, n);
+	/* If queue is full drop frame */
+	if (unlikely(skb_queue_len(&offload->skb_queue) >
+		     offload->skb_queue_len_max))
+		drop = true;
 
+	skb = offload->mailbox_read(offload, n, &timestamp, drop);
 	/* Mailbox was empty. */
-	if (unlikely(!ret)) {
-		kfree_skb(skb);
+	if (unlikely(!skb))
 		return NULL;
-	}
-
-	/* There was a problem reading the mailbox, propagate error value. */
-	if (unlikely(ret < 0)) {
-		kfree_skb(skb);
 
+	/* There was a problem reading the mailbox, propagate
+	 * error value.
+	 */
+	if (unlikely(IS_ERR(skb))) {
 		offload->dev->stats.rx_dropped++;
 		offload->dev->stats.rx_fifo_errors++;
 
-		return ERR_PTR(ret);
+		return skb;
 	}
 
 	/* Mailbox was read. */
+	cb = can_rx_offload_get_cb(skb);
+	cb->timestamp = timestamp;
+
 	return skb;
 }
 
diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c
index 31ad364a89bb..94b1491b569f 100644
--- a/drivers/net/can/ti_hecc.c
+++ b/drivers/net/can/ti_hecc.c
@@ -535,15 +535,28 @@ struct ti_hecc_priv *rx_offload_to_priv(struct can_rx_offload *offload)
 	return container_of(offload, struct ti_hecc_priv, offload);
 }
 
-static unsigned int ti_hecc_mailbox_read(struct can_rx_offload *offload,
-					 struct can_frame *cf,
-					 u32 *timestamp, unsigned int mbxno)
+static struct sk_buff *ti_hecc_mailbox_read(struct can_rx_offload *offload,
+					    unsigned int mbxno, u32 *timestamp,
+					    bool drop)
 {
 	struct ti_hecc_priv *priv = rx_offload_to_priv(offload);
+	struct sk_buff *skb;
+	struct can_frame *cf;
 	u32 data, mbx_mask;
-	int ret = 1;
 
 	mbx_mask = BIT(mbxno);
+
+	if (unlikely(drop)) {
+		skb = ERR_PTR(-ENOBUFS);
+		goto mark_as_read;
+	}
+
+	skb = alloc_can_skb(offload->dev, &cf);
+	if (unlikely(!skb)) {
+		skb = ERR_PTR(-ENOMEM);
+		goto mark_as_read;
+	}
+
 	data = hecc_read_mbx(priv, mbxno, HECC_CANMID);
 	if (data & HECC_CANMID_IDE)
 		cf->can_id = (data & CAN_EFF_MASK) | CAN_EFF_FLAG;
@@ -578,11 +591,12 @@ static unsigned int ti_hecc_mailbox_read(struct can_rx_offload *offload,
 	 */
 	if (unlikely(mbxno == HECC_RX_LAST_MBOX &&
 		     hecc_read(priv, HECC_CANRML) & mbx_mask))
-		ret = -ENOBUFS;
+		skb = ERR_PTR(-ENOBUFS);
 
+ mark_as_read:
 	hecc_write(priv, HECC_CANRMP, mbx_mask);
 
-	return ret;
+	return skb;
 }
 
 static int ti_hecc_error(struct net_device *ndev, int int_status,
diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h
index fc75e9a7ad2f..1b78a0cfb615 100644
--- a/include/linux/can/rx-offload.h
+++ b/include/linux/can/rx-offload.h
@@ -15,9 +15,9 @@
 struct can_rx_offload {
 	struct net_device *dev;
 
-	unsigned int (*mailbox_read)(struct can_rx_offload *offload,
-				     struct can_frame *cf,
-				     u32 *timestamp, unsigned int mb);
+	struct sk_buff *(*mailbox_read)(struct can_rx_offload *offload,
+					unsigned int mb, u32 *timestamp,
+					bool drop);
 
 	struct sk_buff_head skb_queue;
 	u32 skb_queue_len_max;
-- 
cgit v1.2.3-59-g8ed1b


From 6c0867022352027409f5a9fee1d3c6923f9e083e Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Tue, 12 Nov 2019 11:35:00 +0000
Subject: net: sfp: fix sfp_bus_add_upstream() warning

When building with SFP disabled, the stub for sfp_bus_add_upstream()
missed "inline".  Add it.

Fixes: 727b3668b730 ("net: sfp: rework upstream interface")
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sfp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index c8464de7cff5..3b35efd85bb1 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -563,8 +563,8 @@ static inline struct sfp_bus *sfp_bus_find_fwnode(struct fwnode_handle *fwnode)
 	return NULL;
 }
 
-static int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
-				const struct sfp_upstream_ops *ops)
+static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
+				       const struct sfp_upstream_ops *ops)
 {
 	return 0;
 }
-- 
cgit v1.2.3-59-g8ed1b


From e2cde864a1d3e3626bfc8fa088fbc82b04ce66ed Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@mellanox.com>
Date: Tue, 12 Nov 2019 14:07:49 +0200
Subject: devlink: Allow large formatted message of binary output

Devlink supports pair output of name and value. When the value is
binary, it must be presented in an array. If the length of the binary
value exceeds fmsg limitation, break the value into chunks internally.

Signed-off-by: Aya Levin <ayal@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h |  4 +---
 net/core/devlink.c    | 24 +++++++++++++++---------
 2 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 7891611868e4..7e72b2e71164 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -967,8 +967,6 @@ int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value);
 int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value);
 int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value);
 int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value);
-int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
-			    u16 value_len);
 
 int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
 			       bool value);
@@ -981,7 +979,7 @@ int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
 int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
 				 const char *value);
 int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
-				 const void *value, u16 value_len);
+				 const void *value, u32 value_len);
 
 struct devlink_health_reporter *
 devlink_health_reporter_create(struct devlink *devlink,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 2e027c9436e0..9bad78388a07 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -4414,12 +4414,11 @@ int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
 }
 EXPORT_SYMBOL_GPL(devlink_fmsg_string_put);
 
-int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
-			    u16 value_len)
+static int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
+				   u16 value_len)
 {
 	return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY);
 }
-EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put);
 
 int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
 			       bool value)
@@ -4527,19 +4526,26 @@ int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
 EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put);
 
 int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
-				 const void *value, u16 value_len)
+				 const void *value, u32 value_len)
 {
+	u32 data_size;
+	u32 offset;
 	int err;
 
-	err = devlink_fmsg_pair_nest_start(fmsg, name);
+	err = devlink_fmsg_arr_pair_nest_start(fmsg, name);
 	if (err)
 		return err;
 
-	err = devlink_fmsg_binary_put(fmsg, value, value_len);
-	if (err)
-		return err;
+	for (offset = 0; offset < value_len; offset += data_size) {
+		data_size = value_len - offset;
+		if (data_size > DEVLINK_FMSG_MAX_SIZE)
+			data_size = DEVLINK_FMSG_MAX_SIZE;
+		err = devlink_fmsg_binary_put(fmsg, value + offset, data_size);
+		if (err)
+			return err;
+	}
 
-	err = devlink_fmsg_pair_nest_end(fmsg);
+	err = devlink_fmsg_arr_pair_nest_end(fmsg);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3-59-g8ed1b


From e0e2b35b790fefbcff5689984a134cdaa4ce051c Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Tue, 12 Nov 2019 15:33:11 +0100
Subject: net/sched: actions: remove unused 'order'

after commit 4097e9d250fb ("net: sched: don't use tc_action->order during
action dump"), 'act->order' is initialized but then it's no more read, so
we can just remove this member of struct tc_action.

CC: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h | 1 -
 net/sched/act_api.c   | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 0495bdc034d2..71347a90a9d1 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -23,7 +23,6 @@ struct tc_action_ops;
 struct tc_action {
 	const struct tc_action_ops	*ops;
 	__u32				type; /* for backward compat(TCA_OLD_COMPAT) */
-	__u32				order;
 	struct tcf_idrinfo		*idrinfo;
 
 	u32				tcfa_index;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index bda1ba25c59e..7fc1e2c1b656 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1003,7 +1003,6 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 			err = PTR_ERR(act);
 			goto err;
 		}
-		act->order = i;
 		sz += tcf_action_fill_size(act);
 		/* Start from index 0 */
 		actions[i - 1] = act;
-- 
cgit v1.2.3-59-g8ed1b


From b32d2f341623765f525b1a559aa1758599ed7094 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 12 Nov 2019 00:29:51 +0100
Subject: netfilter: nf_flow_table: move conntrack object to struct
 flow_offload

Simplify this code by storing the pointer to conntrack object in the
flow_offload structure.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netfilter/nf_flow_table.h |  1 +
 net/netfilter/nf_flow_table_core.c    | 35 +++++++++++------------------------
 2 files changed, 12 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 158514281a75..88c8cd248213 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -72,6 +72,7 @@ struct flow_offload_tuple_rhash {
 
 struct flow_offload {
 	struct flow_offload_tuple_rhash		tuplehash[FLOW_OFFLOAD_DIR_MAX];
+	struct nf_conn				*ct;
 	u32					flags;
 	union {
 		/* Your private driver data here. */
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 128245efe84a..aca40ccbcceb 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -16,7 +16,6 @@
 
 struct flow_offload_entry {
 	struct flow_offload	flow;
-	struct nf_conn		*ct;
 	struct rcu_head		rcu_head;
 };
 
@@ -79,7 +78,7 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
 	if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst))
 		goto err_dst_cache_reply;
 
-	entry->ct = ct;
+	flow->ct = ct;
 
 	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
 	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
@@ -158,8 +157,8 @@ void flow_offload_free(struct flow_offload *flow)
 	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
 	e = container_of(flow, struct flow_offload_entry, flow);
 	if (flow->flags & FLOW_OFFLOAD_DYING)
-		nf_ct_delete(e->ct, 0, 0);
-	nf_ct_put(e->ct);
+		nf_ct_delete(flow->ct, 0, 0);
+	nf_ct_put(flow->ct);
 	kfree_rcu(e, rcu_head);
 }
 EXPORT_SYMBOL_GPL(flow_offload_free);
@@ -232,8 +231,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
 static void flow_offload_del(struct nf_flowtable *flow_table,
 			     struct flow_offload *flow)
 {
-	struct flow_offload_entry *e;
-
 	rhashtable_remove_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
 			       nf_flow_offload_rhash_params);
@@ -241,25 +238,21 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
 			       nf_flow_offload_rhash_params);
 
-	e = container_of(flow, struct flow_offload_entry, flow);
-	clear_bit(IPS_OFFLOAD_BIT, &e->ct->status);
+	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
 
 	if (nf_flow_has_expired(flow))
-		flow_offload_fixup_ct(e->ct);
+		flow_offload_fixup_ct(flow->ct);
 	else if (flow->flags & FLOW_OFFLOAD_TEARDOWN)
-		flow_offload_fixup_ct_timeout(e->ct);
+		flow_offload_fixup_ct_timeout(flow->ct);
 
 	flow_offload_free(flow);
 }
 
 void flow_offload_teardown(struct flow_offload *flow)
 {
-	struct flow_offload_entry *e;
-
 	flow->flags |= FLOW_OFFLOAD_TEARDOWN;
 
-	e = container_of(flow, struct flow_offload_entry, flow);
-	flow_offload_fixup_ct_state(e->ct);
+	flow_offload_fixup_ct_state(flow->ct);
 }
 EXPORT_SYMBOL_GPL(flow_offload_teardown);
 
@@ -269,7 +262,6 @@ flow_offload_lookup(struct nf_flowtable *flow_table,
 {
 	struct flow_offload_tuple_rhash *tuplehash;
 	struct flow_offload *flow;
-	struct flow_offload_entry *e;
 	int dir;
 
 	tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
@@ -282,8 +274,7 @@ flow_offload_lookup(struct nf_flowtable *flow_table,
 	if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))
 		return NULL;
 
-	e = container_of(flow, struct flow_offload_entry, flow);
-	if (unlikely(nf_ct_is_dying(e->ct)))
+	if (unlikely(nf_ct_is_dying(flow->ct)))
 		return NULL;
 
 	return tuplehash;
@@ -327,10 +318,8 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table,
 static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
 {
 	struct nf_flowtable *flow_table = data;
-	struct flow_offload_entry *e;
 
-	e = container_of(flow, struct flow_offload_entry, flow);
-	if (nf_flow_has_expired(flow) || nf_ct_is_dying(e->ct) ||
+	if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) ||
 	    (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)))
 		flow_offload_del(flow_table, flow);
 }
@@ -485,15 +474,13 @@ EXPORT_SYMBOL_GPL(nf_flow_table_init);
 static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
 {
 	struct net_device *dev = data;
-	struct flow_offload_entry *e;
-
-	e = container_of(flow, struct flow_offload_entry, flow);
 
 	if (!dev) {
 		flow_offload_teardown(flow);
 		return;
 	}
-	if (net_eq(nf_ct_net(e->ct), dev_net(dev)) &&
+
+	if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
 	    (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
 	     flow->tuplehash[1].tuple.iifidx == dev->ifindex))
 		flow_offload_dead(flow);
-- 
cgit v1.2.3-59-g8ed1b


From 9f48e9bf253aa292dbf10f173f6f4c02d0349f45 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 12 Nov 2019 00:29:52 +0100
Subject: netfilter: nf_flow_table: remove union from flow_offload structure

Drivers do not have access to the flow_offload structure, hence remove
this union from this flow_offload object as well as the original comment
on top of it.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netfilter/nf_flow_table.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 88c8cd248213..7f892d6c1a6d 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -74,10 +74,7 @@ struct flow_offload {
 	struct flow_offload_tuple_rhash		tuplehash[FLOW_OFFLOAD_DIR_MAX];
 	struct nf_conn				*ct;
 	u32					flags;
-	union {
-		/* Your private driver data here. */
-		u32		timeout;
-	};
+	u32					timeout;
 };
 
 #define NF_FLOW_TIMEOUT (30 * HZ)
-- 
cgit v1.2.3-59-g8ed1b


From 62248df88a406a443b838a3633a7f60a716f999e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 12 Nov 2019 00:29:53 +0100
Subject: netfilter: nf_flowtable: remove flow_offload_entry structure

Move rcu_head to struct flow_offload, then remove the flow_offload_entry
structure definition.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netfilter/nf_flow_table.h |  1 +
 net/netfilter/nf_flow_table_core.c    | 19 ++++---------------
 2 files changed, 5 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 7f892d6c1a6d..6d33734c8fa1 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -75,6 +75,7 @@ struct flow_offload {
 	struct nf_conn				*ct;
 	u32					flags;
 	u32					timeout;
+	struct rcu_head				rcu_head;
 };
 
 #define NF_FLOW_TIMEOUT (30 * HZ)
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index aca40ccbcceb..15a5555940c7 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -14,11 +14,6 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
 
-struct flow_offload_entry {
-	struct flow_offload	flow;
-	struct rcu_head		rcu_head;
-};
-
 static DEFINE_MUTEX(flowtable_lock);
 static LIST_HEAD(flowtables);
 
@@ -59,19 +54,16 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 struct flow_offload *
 flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
 {
-	struct flow_offload_entry *entry;
 	struct flow_offload *flow;
 
 	if (unlikely(nf_ct_is_dying(ct) ||
 	    !atomic_inc_not_zero(&ct->ct_general.use)))
 		return NULL;
 
-	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
-	if (!entry)
+	flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
+	if (!flow)
 		goto err_ct_refcnt;
 
-	flow = &entry->flow;
-
 	if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst))
 		goto err_dst_cache_original;
 
@@ -93,7 +85,7 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
 err_dst_cache_reply:
 	dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
 err_dst_cache_original:
-	kfree(entry);
+	kfree(flow);
 err_ct_refcnt:
 	nf_ct_put(ct);
 
@@ -151,15 +143,12 @@ static void flow_offload_fixup_ct(struct nf_conn *ct)
 
 void flow_offload_free(struct flow_offload *flow)
 {
-	struct flow_offload_entry *e;
-
 	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
 	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
-	e = container_of(flow, struct flow_offload_entry, flow);
 	if (flow->flags & FLOW_OFFLOAD_DYING)
 		nf_ct_delete(flow->ct, 0, 0);
 	nf_ct_put(flow->ct);
-	kfree_rcu(e, rcu_head);
+	kfree_rcu(flow, rcu_head);
 }
 EXPORT_SYMBOL_GPL(flow_offload_free);
 
-- 
cgit v1.2.3-59-g8ed1b


From f1363e058b84e61d39f9796fa806090ad7a28ebd Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 12 Nov 2019 00:29:54 +0100
Subject: netfilter: nf_flow_table: detach routing information from flow
 description

This patch adds the infrastructure to support for flow entry types.
The initial type is NF_FLOW_OFFLOAD_ROUTE that stores the routing
information into the flow entry to define a fastpath for the classic
forwarding path.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netfilter/nf_flow_table.h | 14 ++++--
 net/netfilter/nf_flow_table_core.c    | 88 ++++++++++++++++++++++++++---------
 net/netfilter/nft_flow_offload.c      |  5 +-
 3 files changed, 80 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 6d33734c8fa1..f000e8917487 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -70,10 +70,16 @@ struct flow_offload_tuple_rhash {
 #define FLOW_OFFLOAD_DYING	0x4
 #define FLOW_OFFLOAD_TEARDOWN	0x8
 
+enum flow_offload_type {
+	NF_FLOW_OFFLOAD_UNSPEC	= 0,
+	NF_FLOW_OFFLOAD_ROUTE,
+};
+
 struct flow_offload {
 	struct flow_offload_tuple_rhash		tuplehash[FLOW_OFFLOAD_DIR_MAX];
 	struct nf_conn				*ct;
-	u32					flags;
+	u16					flags;
+	u16					type;
 	u32					timeout;
 	struct rcu_head				rcu_head;
 };
@@ -86,10 +92,12 @@ struct nf_flow_route {
 	} tuple[FLOW_OFFLOAD_DIR_MAX];
 };
 
-struct flow_offload *flow_offload_alloc(struct nf_conn *ct,
-					struct nf_flow_route *route);
+struct flow_offload *flow_offload_alloc(struct nf_conn *ct);
 void flow_offload_free(struct flow_offload *flow);
 
+int flow_offload_route_init(struct flow_offload *flow,
+			    const struct nf_flow_route *route);
+
 int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow);
 struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table,
 						     struct flow_offload_tuple *tuple);
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 15a5555940c7..139a5e074743 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -19,13 +19,10 @@ static LIST_HEAD(flowtables);
 
 static void
 flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
-		      struct nf_flow_route *route,
 		      enum flow_offload_tuple_dir dir)
 {
 	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
 	struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
-	struct dst_entry *other_dst = route->tuple[!dir].dst;
-	struct dst_entry *dst = route->tuple[dir].dst;
 
 	ft->dir = dir;
 
@@ -33,12 +30,10 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 	case NFPROTO_IPV4:
 		ft->src_v4 = ctt->src.u3.in;
 		ft->dst_v4 = ctt->dst.u3.in;
-		ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
 		break;
 	case NFPROTO_IPV6:
 		ft->src_v6 = ctt->src.u3.in6;
 		ft->dst_v6 = ctt->dst.u3.in6;
-		ft->mtu = ip6_dst_mtu_forward(dst);
 		break;
 	}
 
@@ -46,13 +41,9 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 	ft->l4proto = ctt->dst.protonum;
 	ft->src_port = ctt->src.u.tcp.port;
 	ft->dst_port = ctt->dst.u.tcp.port;
-
-	ft->iifidx = other_dst->dev->ifindex;
-	ft->dst_cache = dst;
 }
 
-struct flow_offload *
-flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
+struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
 {
 	struct flow_offload *flow;
 
@@ -64,16 +55,10 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
 	if (!flow)
 		goto err_ct_refcnt;
 
-	if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst))
-		goto err_dst_cache_original;
-
-	if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst))
-		goto err_dst_cache_reply;
-
 	flow->ct = ct;
 
-	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
-	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
+	flow_offload_fill_dir(flow, ct, FLOW_OFFLOAD_DIR_ORIGINAL);
+	flow_offload_fill_dir(flow, ct, FLOW_OFFLOAD_DIR_REPLY);
 
 	if (ct->status & IPS_SRC_NAT)
 		flow->flags |= FLOW_OFFLOAD_SNAT;
@@ -82,10 +67,6 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
 
 	return flow;
 
-err_dst_cache_reply:
-	dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
-err_dst_cache_original:
-	kfree(flow);
 err_ct_refcnt:
 	nf_ct_put(ct);
 
@@ -93,6 +74,56 @@ err_ct_refcnt:
 }
 EXPORT_SYMBOL_GPL(flow_offload_alloc);
 
+static int flow_offload_fill_route(struct flow_offload *flow,
+				   const struct nf_flow_route *route,
+				   enum flow_offload_tuple_dir dir)
+{
+	struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
+	struct dst_entry *other_dst = route->tuple[!dir].dst;
+	struct dst_entry *dst = route->tuple[dir].dst;
+
+	if (!dst_hold_safe(route->tuple[dir].dst))
+		return -1;
+
+	switch (flow_tuple->l3proto) {
+	case NFPROTO_IPV4:
+		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
+		break;
+	case NFPROTO_IPV6:
+		flow_tuple->mtu = ip6_dst_mtu_forward(dst);
+		break;
+	}
+
+	flow_tuple->iifidx = other_dst->dev->ifindex;
+	flow_tuple->dst_cache = dst;
+
+	return 0;
+}
+
+int flow_offload_route_init(struct flow_offload *flow,
+			    const struct nf_flow_route *route)
+{
+	int err;
+
+	err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
+	if (err < 0)
+		return err;
+
+	err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
+	if (err < 0)
+		goto err_route_reply;
+
+	flow->type = NF_FLOW_OFFLOAD_ROUTE;
+
+	return 0;
+
+err_route_reply:
+	dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(flow_offload_route_init);
+
 static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
 {
 	tcp->state = TCP_CONNTRACK_ESTABLISHED;
@@ -141,10 +172,21 @@ static void flow_offload_fixup_ct(struct nf_conn *ct)
 	flow_offload_fixup_ct_timeout(ct);
 }
 
-void flow_offload_free(struct flow_offload *flow)
+static void flow_offload_route_release(struct flow_offload *flow)
 {
 	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
 	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
+}
+
+void flow_offload_free(struct flow_offload *flow)
+{
+	switch (flow->type) {
+	case NF_FLOW_OFFLOAD_ROUTE:
+		flow_offload_route_release(flow);
+		break;
+	default:
+		break;
+	}
 	if (flow->flags & FLOW_OFFLOAD_DYING)
 		nf_ct_delete(flow->ct, 0, 0);
 	nf_ct_put(flow->ct);
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index f29bbc74c4bf..dd82ff2ee19f 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -115,10 +115,13 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
 	if (nft_flow_route(pkt, ct, &route, dir) < 0)
 		goto err_flow_route;
 
-	flow = flow_offload_alloc(ct, &route);
+	flow = flow_offload_alloc(ct);
 	if (!flow)
 		goto err_flow_alloc;
 
+	if (flow_offload_route_init(flow, &route) < 0)
+		goto err_flow_add;
+
 	if (tcph) {
 		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
 		ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
-- 
cgit v1.2.3-59-g8ed1b


From 8bb69f3b2918788435cbd5834c66682642c09fba Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 12 Nov 2019 00:29:55 +0100
Subject: netfilter: nf_tables: add flowtable offload control plane

This patch adds the NFTA_FLOWTABLE_FLAGS attribute that allows users to
specify the NF_FLOWTABLE_HW_OFFLOAD flag. This patch also adds a new
setup interface for the flowtable type to perform the flowtable offload
block callback configuration.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netfilter/nf_flow_table.h    | 18 ++++++++++++++++++
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/ipv4/netfilter/nf_flow_table_ipv4.c  |  1 +
 net/ipv6/netfilter/nf_flow_table_ipv6.c  |  1 +
 net/netfilter/nf_flow_table_inet.c       |  1 +
 net/netfilter/nf_tables_api.c            | 21 +++++++++++++++++++--
 6 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index f000e8917487..ece09d36c7a6 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -8,6 +8,7 @@
 #include <linux/rcupdate.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_conntrack_tuple_common.h>
+#include <net/flow_offload.h>
 #include <net/dst.h>
 
 struct nf_flowtable;
@@ -16,17 +17,27 @@ struct nf_flowtable_type {
 	struct list_head		list;
 	int				family;
 	int				(*init)(struct nf_flowtable *ft);
+	int				(*setup)(struct nf_flowtable *ft,
+						 struct net_device *dev,
+						 enum flow_block_command cmd);
 	void				(*free)(struct nf_flowtable *ft);
 	nf_hookfn			*hook;
 	struct module			*owner;
 };
 
+enum nf_flowtable_flags {
+	NF_FLOWTABLE_HW_OFFLOAD		= 0x1,
+};
+
 struct nf_flowtable {
 	struct list_head		list;
 	struct rhashtable		rhashtable;
 	int				priority;
 	const struct nf_flowtable_type	*type;
 	struct delayed_work		gc_work;
+	unsigned int			flags;
+	struct flow_block		flow_block;
+	possible_net_t			net;
 };
 
 enum flow_offload_tuple_dir {
@@ -131,4 +142,11 @@ unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 #define MODULE_ALIAS_NF_FLOWTABLE(family)	\
 	MODULE_ALIAS("nf-flowtable-" __stringify(family))
 
+static inline int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
+					      struct net_device *dev,
+					      enum flow_block_command cmd)
+{
+	return 0;
+}
+
 #endif /* _NF_FLOW_TABLE_H */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 81fed16fe2b2..bb9b049310df 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1518,6 +1518,7 @@ enum nft_object_attributes {
  * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32)
  * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32)
  * @NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64)
+ * @NFTA_FLOWTABLE_FLAGS: flags (NLA_U32)
  */
 enum nft_flowtable_attributes {
 	NFTA_FLOWTABLE_UNSPEC,
@@ -1527,6 +1528,7 @@ enum nft_flowtable_attributes {
 	NFTA_FLOWTABLE_USE,
 	NFTA_FLOWTABLE_HANDLE,
 	NFTA_FLOWTABLE_PAD,
+	NFTA_FLOWTABLE_FLAGS,
 	__NFTA_FLOWTABLE_MAX
 };
 #define NFTA_FLOWTABLE_MAX	(__NFTA_FLOWTABLE_MAX - 1)
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 012c4047c788..f3befddb5fdd 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -9,6 +9,7 @@
 static struct nf_flowtable_type flowtable_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.init		= nf_flow_table_init,
+	.setup		= nf_flow_table_offload_setup,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ip_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index f6d9a48c7a2a..1c47f05eabd6 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -10,6 +10,7 @@
 static struct nf_flowtable_type flowtable_ipv6 = {
 	.family		= NFPROTO_IPV6,
 	.init		= nf_flow_table_init,
+	.setup		= nf_flow_table_offload_setup,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ipv6_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 593357aedb36..1e70fd504da3 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -24,6 +24,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
 static struct nf_flowtable_type flowtable_inet = {
 	.family		= NFPROTO_INET,
 	.init		= nf_flow_table_init,
+	.setup		= nf_flow_table_offload_setup,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_inet_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 0d2243945f1d..2dc636faa322 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5835,6 +5835,7 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
 					    .len = NFT_NAME_MAXLEN - 1 },
 	[NFTA_FLOWTABLE_HOOK]		= { .type = NLA_NESTED },
 	[NFTA_FLOWTABLE_HANDLE]		= { .type = NLA_U64 },
+	[NFTA_FLOWTABLE_FLAGS]		= { .type = NLA_U32 },
 };
 
 struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table,
@@ -5968,8 +5969,11 @@ static void nft_unregister_flowtable_net_hooks(struct net *net,
 {
 	struct nft_hook *hook;
 
-	list_for_each_entry(hook, &flowtable->hook_list, list)
+	list_for_each_entry(hook, &flowtable->hook_list, list) {
 		nf_unregister_net_hook(net, &hook->ops);
+		flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
+					    FLOW_BLOCK_UNBIND);
+	}
 }
 
 static int nft_register_flowtable_net_hooks(struct net *net,
@@ -5991,6 +5995,8 @@ static int nft_register_flowtable_net_hooks(struct net *net,
 			}
 		}
 
+		flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
+					    FLOW_BLOCK_BIND);
 		err = nf_register_net_hook(net, &hook->ops);
 		if (err < 0)
 			goto err_unregister_net_hooks;
@@ -6006,6 +6012,8 @@ err_unregister_net_hooks:
 			break;
 
 		nf_unregister_net_hook(net, &hook->ops);
+		flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
+					    FLOW_BLOCK_UNBIND);
 		list_del_rcu(&hook->list);
 		kfree_rcu(hook, rcu);
 	}
@@ -6080,6 +6088,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 		goto err2;
 	}
 
+	if (nla[NFTA_FLOWTABLE_FLAGS]) {
+		flowtable->data.flags =
+			ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
+		if (flowtable->data.flags & ~NF_FLOWTABLE_HW_OFFLOAD)
+			goto err3;
+	}
+
+	write_pnet(&flowtable->data.net, net);
 	flowtable->data.type = type;
 	err = type->init(&flowtable->data);
 	if (err < 0)
@@ -6191,7 +6207,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
 	    nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
 	    nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
 	    nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
-			 NFTA_FLOWTABLE_PAD))
+			 NFTA_FLOWTABLE_PAD) ||
+	    nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags)))
 		goto nla_put_failure;
 
 	nest = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK);
-- 
cgit v1.2.3-59-g8ed1b


From c29f74e0df7a02b8303bcdce93a7c0132d62577a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 12 Nov 2019 00:29:56 +0100
Subject: netfilter: nf_flow_table: hardware offload support

This patch adds the dataplane hardware offload to the flowtable
infrastructure. Three new flags represent the hardware state of this
flow:

* FLOW_OFFLOAD_HW: This flow entry resides in the hardware.
* FLOW_OFFLOAD_HW_DYING: This flow entry has been scheduled to be remove
  from hardware. This might be triggered by either packet path (via TCP
  RST/FIN packet) or via aging.
* FLOW_OFFLOAD_HW_DEAD: This flow entry has been already removed from
  the hardware, the software garbage collector can remove it from the
  software flowtable.

This patch supports for:

* IPv4 only.
* Aging via FLOW_CLS_STATS, no packet and byte counter synchronization
  at this stage.

This patch also adds the action callback that specifies how to convert
the flow entry into the flow_rule object that is passed to the driver.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h               |   1 +
 include/net/netfilter/nf_flow_table.h   |  33 +-
 net/ipv4/netfilter/nf_flow_table_ipv4.c |   1 +
 net/ipv6/netfilter/nf_flow_table_ipv6.c |   1 +
 net/netfilter/Makefile                  |   3 +-
 net/netfilter/nf_flow_table_core.c      |  33 +-
 net/netfilter/nf_flow_table_inet.c      |   1 +
 net/netfilter/nf_flow_table_offload.c   | 758 ++++++++++++++++++++++++++++++++
 8 files changed, 822 insertions(+), 9 deletions(-)
 create mode 100644 net/netfilter/nf_flow_table_offload.c

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f857f01234f7..9e6fb8524d91 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -848,6 +848,7 @@ enum tc_setup_type {
 	TC_SETUP_ROOT_QDISC,
 	TC_SETUP_QDISC_GRED,
 	TC_SETUP_QDISC_TAPRIO,
+	TC_SETUP_FT,
 };
 
 /* These structures hold the attributes of bpf state that are being passed
diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index ece09d36c7a6..eea66de328d3 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -12,6 +12,9 @@
 #include <net/dst.h>
 
 struct nf_flowtable;
+struct nf_flow_rule;
+struct flow_offload;
+enum flow_offload_tuple_dir;
 
 struct nf_flowtable_type {
 	struct list_head		list;
@@ -20,6 +23,10 @@ struct nf_flowtable_type {
 	int				(*setup)(struct nf_flowtable *ft,
 						 struct net_device *dev,
 						 enum flow_block_command cmd);
+	int				(*action)(struct net *net,
+						  const struct flow_offload *flow,
+						  enum flow_offload_tuple_dir dir,
+						  struct nf_flow_rule *flow_rule);
 	void				(*free)(struct nf_flowtable *ft);
 	nf_hookfn			*hook;
 	struct module			*owner;
@@ -80,6 +87,9 @@ struct flow_offload_tuple_rhash {
 #define FLOW_OFFLOAD_DNAT	0x2
 #define FLOW_OFFLOAD_DYING	0x4
 #define FLOW_OFFLOAD_TEARDOWN	0x8
+#define FLOW_OFFLOAD_HW		0x10
+#define FLOW_OFFLOAD_HW_DYING	0x20
+#define FLOW_OFFLOAD_HW_DEAD	0x40
 
 enum flow_offload_type {
 	NF_FLOW_OFFLOAD_UNSPEC	= 0,
@@ -142,11 +152,22 @@ unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 #define MODULE_ALIAS_NF_FLOWTABLE(family)	\
 	MODULE_ALIAS("nf-flowtable-" __stringify(family))
 
-static inline int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
-					      struct net_device *dev,
-					      enum flow_block_command cmd)
-{
-	return 0;
-}
+void nf_flow_offload_add(struct nf_flowtable *flowtable,
+			 struct flow_offload *flow);
+void nf_flow_offload_del(struct nf_flowtable *flowtable,
+			 struct flow_offload *flow);
+void nf_flow_offload_stats(struct nf_flowtable *flowtable,
+			   struct flow_offload *flow);
+
+void nf_flow_table_offload_flush(struct nf_flowtable *flowtable);
+int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
+				struct net_device *dev,
+				enum flow_block_command cmd);
+int nf_flow_rule_route(struct net *net, const struct flow_offload *flow,
+		       enum flow_offload_tuple_dir dir,
+		       struct nf_flow_rule *flow_rule);
+
+int nf_flow_table_offload_init(void);
+void nf_flow_table_offload_exit(void);
 
 #endif /* _NF_FLOW_TABLE_H */
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index f3befddb5fdd..168b72e18be0 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -10,6 +10,7 @@ static struct nf_flowtable_type flowtable_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.init		= nf_flow_table_init,
 	.setup		= nf_flow_table_offload_setup,
+	.action		= nf_flow_rule_route,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ip_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index 1c47f05eabd6..f069bc0dc056 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -11,6 +11,7 @@ static struct nf_flowtable_type flowtable_ipv6 = {
 	.family		= NFPROTO_IPV6,
 	.init		= nf_flow_table_init,
 	.setup		= nf_flow_table_offload_setup,
+	.action		= nf_flow_rule_route,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ipv6_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 4fc075b612fe..5e9b2eb24349 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -120,7 +120,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV)	+= nft_fwd_netdev.o
 
 # flow table infrastructure
 obj-$(CONFIG_NF_FLOW_TABLE)	+= nf_flow_table.o
-nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o
+nf_flow_table-objs		:= nf_flow_table_core.o nf_flow_table_ip.o \
+				   nf_flow_table_offload.o
 
 obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
 
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 139a5e074743..8468d2d02284 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -250,6 +250,9 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
 		return err;
 	}
 
+	if (flow_table->flags & NF_FLOWTABLE_HW_OFFLOAD)
+		nf_flow_offload_add(flow_table, flow);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(flow_offload_add);
@@ -350,9 +353,20 @@ static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
 {
 	struct nf_flowtable *flow_table = data;
 
+	if (flow->flags & FLOW_OFFLOAD_HW)
+		nf_flow_offload_stats(flow_table, flow);
+
 	if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) ||
-	    (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)))
-		flow_offload_del(flow_table, flow);
+	    (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) {
+		if (flow->flags & FLOW_OFFLOAD_HW) {
+			if (!(flow->flags & FLOW_OFFLOAD_HW_DYING))
+				nf_flow_offload_del(flow_table, flow);
+			else if (flow->flags & FLOW_OFFLOAD_HW_DEAD)
+				flow_offload_del(flow_table, flow);
+		} else {
+			flow_offload_del(flow_table, flow);
+		}
+	}
 }
 
 static void nf_flow_offload_work_gc(struct work_struct *work)
@@ -485,6 +499,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
 	int err;
 
 	INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+	flow_block_init(&flowtable->flow_block);
 
 	err = rhashtable_init(&flowtable->rhashtable,
 			      &nf_flow_offload_rhash_params);
@@ -520,6 +535,7 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
 static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
 					  struct net_device *dev)
 {
+	nf_flow_table_offload_flush(flowtable);
 	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
 	flush_delayed_work(&flowtable->gc_work);
 }
@@ -547,5 +563,18 @@ void nf_flow_table_free(struct nf_flowtable *flow_table)
 }
 EXPORT_SYMBOL_GPL(nf_flow_table_free);
 
+static int __init nf_flow_table_module_init(void)
+{
+	return nf_flow_table_offload_init();
+}
+
+static void __exit nf_flow_table_module_exit(void)
+{
+	nf_flow_table_offload_exit();
+}
+
+module_init(nf_flow_table_module_init);
+module_exit(nf_flow_table_module_exit);
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 1e70fd504da3..bfb910b874ce 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -25,6 +25,7 @@ static struct nf_flowtable_type flowtable_inet = {
 	.family		= NFPROTO_INET,
 	.init		= nf_flow_table_init,
 	.setup		= nf_flow_table_offload_setup,
+	.action		= nf_flow_rule_route,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_inet_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
new file mode 100644
index 000000000000..9be61f47303a
--- /dev/null
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -0,0 +1,758 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+#include <linux/tc_act/tc_csum.h>
+#include <net/flow_offload.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+
+static struct work_struct nf_flow_offload_work;
+static DEFINE_SPINLOCK(flow_offload_pending_list_lock);
+static LIST_HEAD(flow_offload_pending_list);
+
+struct flow_offload_work {
+	struct list_head	list;
+	enum flow_cls_command	cmd;
+	int			priority;
+	struct nf_flowtable	*flowtable;
+	struct flow_offload	*flow;
+};
+
+struct nf_flow_key {
+	struct flow_dissector_key_control		control;
+	struct flow_dissector_key_basic			basic;
+	union {
+		struct flow_dissector_key_ipv4_addrs	ipv4;
+	};
+	struct flow_dissector_key_tcp			tcp;
+	struct flow_dissector_key_ports			tp;
+} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
+
+struct nf_flow_match {
+	struct flow_dissector	dissector;
+	struct nf_flow_key	key;
+	struct nf_flow_key	mask;
+};
+
+struct nf_flow_rule {
+	struct nf_flow_match	match;
+	struct flow_rule	*rule;
+};
+
+#define NF_FLOW_DISSECTOR(__match, __type, __field)	\
+	(__match)->dissector.offset[__type] =		\
+		offsetof(struct nf_flow_key, __field)
+
+static int nf_flow_rule_match(struct nf_flow_match *match,
+			      const struct flow_offload_tuple *tuple)
+{
+	struct nf_flow_key *mask = &match->mask;
+	struct nf_flow_key *key = &match->key;
+
+	NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control);
+	NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_BASIC, basic);
+	NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
+	NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp);
+	NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp);
+
+	switch (tuple->l3proto) {
+	case AF_INET:
+		key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+		key->basic.n_proto = htons(ETH_P_IP);
+		key->ipv4.src = tuple->src_v4.s_addr;
+		mask->ipv4.src = 0xffffffff;
+		key->ipv4.dst = tuple->dst_v4.s_addr;
+		mask->ipv4.dst = 0xffffffff;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+	mask->basic.n_proto = 0xffff;
+
+	switch (tuple->l4proto) {
+	case IPPROTO_TCP:
+		key->tcp.flags = 0;
+		mask->tcp.flags = TCP_FLAG_RST | TCP_FLAG_FIN;
+		match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_TCP);
+		break;
+	case IPPROTO_UDP:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	key->basic.ip_proto = tuple->l4proto;
+	mask->basic.ip_proto = 0xff;
+
+	key->tp.src = tuple->src_port;
+	mask->tp.src = 0xffff;
+	key->tp.dst = tuple->dst_port;
+	mask->tp.dst = 0xffff;
+
+	match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL) |
+				      BIT(FLOW_DISSECTOR_KEY_BASIC) |
+				      BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) |
+				      BIT(FLOW_DISSECTOR_KEY_PORTS);
+	return 0;
+}
+
+static void flow_offload_mangle(struct flow_action_entry *entry,
+				enum flow_action_mangle_base htype,
+				u32 offset, u8 *value, u8 *mask)
+{
+	entry->id = FLOW_ACTION_MANGLE;
+	entry->mangle.htype = htype;
+	entry->mangle.offset = offset;
+	memcpy(&entry->mangle.mask, mask, sizeof(u32));
+	memcpy(&entry->mangle.val, value, sizeof(u32));
+}
+
+static int flow_offload_eth_src(struct net *net,
+				const struct flow_offload *flow,
+				enum flow_offload_tuple_dir dir,
+				struct flow_action_entry *entry0,
+				struct flow_action_entry *entry1)
+{
+	const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple;
+	struct net_device *dev;
+	u32 mask, val;
+	u16 val16;
+
+	dev = dev_get_by_index(net, tuple->iifidx);
+	if (!dev)
+		return -ENOENT;
+
+	mask = ~0xffff0000;
+	memcpy(&val16, dev->dev_addr, 2);
+	val = val16 << 16;
+	flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
+			    (u8 *)&val, (u8 *)&mask);
+
+	mask = ~0xffffffff;
+	memcpy(&val, dev->dev_addr + 2, 4);
+	flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8,
+			    (u8 *)&val, (u8 *)&mask);
+	dev_put(dev);
+
+	return 0;
+}
+
+static int flow_offload_eth_dst(struct net *net,
+				const struct flow_offload *flow,
+				enum flow_offload_tuple_dir dir,
+				struct flow_action_entry *entry0,
+				struct flow_action_entry *entry1)
+{
+	const struct flow_offload_tuple *tuple = &flow->tuplehash[dir].tuple;
+	struct neighbour *n;
+	u32 mask, val;
+	u16 val16;
+
+	n = dst_neigh_lookup(tuple->dst_cache, &tuple->dst_v4);
+	if (!n)
+		return -ENOENT;
+
+	mask = ~0xffffffff;
+	memcpy(&val, n->ha, 4);
+	flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 0,
+			    (u8 *)&val, (u8 *)&mask);
+
+	mask = ~0x0000ffff;
+	memcpy(&val16, n->ha + 4, 2);
+	val = val16;
+	flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
+			    (u8 *)&val, (u8 *)&mask);
+	neigh_release(n);
+
+	return 0;
+}
+
+static void flow_offload_ipv4_snat(struct net *net,
+				   const struct flow_offload *flow,
+				   enum flow_offload_tuple_dir dir,
+				   struct flow_action_entry *entry)
+{
+	u32 mask = ~htonl(0xffffffff);
+	__be32 addr;
+	u32 offset;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
+		offset = offsetof(struct iphdr, saddr);
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
+		offset = offsetof(struct iphdr, daddr);
+		break;
+	default:
+		return;
+	}
+
+	flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset,
+			    (u8 *)&addr, (u8 *)&mask);
+}
+
+static void flow_offload_ipv4_dnat(struct net *net,
+				   const struct flow_offload *flow,
+				   enum flow_offload_tuple_dir dir,
+				   struct flow_action_entry *entry)
+{
+	u32 mask = ~htonl(0xffffffff);
+	__be32 addr;
+	u32 offset;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
+		offset = offsetof(struct iphdr, daddr);
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
+		offset = offsetof(struct iphdr, saddr);
+		break;
+	default:
+		return;
+	}
+
+	flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP4, offset,
+			    (u8 *)&addr, (u8 *)&mask);
+}
+
+static int flow_offload_l4proto(const struct flow_offload *flow)
+{
+	u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto;
+	u8 type = 0;
+
+	switch (protonum) {
+	case IPPROTO_TCP:
+		type = FLOW_ACT_MANGLE_HDR_TYPE_TCP;
+		break;
+	case IPPROTO_UDP:
+		type = FLOW_ACT_MANGLE_HDR_TYPE_UDP;
+		break;
+	default:
+		break;
+	}
+
+	return type;
+}
+
+static void flow_offload_port_snat(struct net *net,
+				   const struct flow_offload *flow,
+				   enum flow_offload_tuple_dir dir,
+				   struct flow_action_entry *entry)
+{
+	u32 mask = ~htonl(0xffff0000);
+	__be16 port;
+	u32 offset;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
+		offset = 0; /* offsetof(struct tcphdr, source); */
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
+		offset = 0; /* offsetof(struct tcphdr, dest); */
+		break;
+	default:
+		break;
+	}
+
+	flow_offload_mangle(entry, flow_offload_l4proto(flow), offset,
+			    (u8 *)&port, (u8 *)&mask);
+}
+
+static void flow_offload_port_dnat(struct net *net,
+				   const struct flow_offload *flow,
+				   enum flow_offload_tuple_dir dir,
+				   struct flow_action_entry *entry)
+{
+	u32 mask = ~htonl(0xffff);
+	__be16 port;
+	u32 offset;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
+		offset = 0; /* offsetof(struct tcphdr, source); */
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
+		offset = 0; /* offsetof(struct tcphdr, dest); */
+		break;
+	default:
+		break;
+	}
+
+	flow_offload_mangle(entry, flow_offload_l4proto(flow), offset,
+			    (u8 *)&port, (u8 *)&mask);
+}
+
+static void flow_offload_ipv4_checksum(struct net *net,
+				       const struct flow_offload *flow,
+				       struct flow_action_entry *entry)
+{
+	u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto;
+
+	entry->id = FLOW_ACTION_CSUM;
+	entry->csum_flags = TCA_CSUM_UPDATE_FLAG_IPV4HDR;
+
+	switch (protonum) {
+	case IPPROTO_TCP:
+		entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_TCP;
+		break;
+	case IPPROTO_UDP:
+		entry->csum_flags |= TCA_CSUM_UPDATE_FLAG_UDP;
+		break;
+	}
+}
+
+static void flow_offload_redirect(const struct flow_offload *flow,
+				  enum flow_offload_tuple_dir dir,
+				  struct flow_action_entry *entry)
+{
+	struct rtable *rt;
+
+	rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+	entry->id = FLOW_ACTION_REDIRECT;
+	entry->dev = rt->dst.dev;
+	dev_hold(rt->dst.dev);
+}
+
+int nf_flow_rule_route(struct net *net, const struct flow_offload *flow,
+		       enum flow_offload_tuple_dir dir,
+		       struct nf_flow_rule *flow_rule)
+{
+	int i;
+
+	if (flow_offload_eth_src(net, flow, dir,
+				 &flow_rule->rule->action.entries[0],
+				 &flow_rule->rule->action.entries[1]) < 0)
+		return -1;
+
+	if (flow_offload_eth_dst(net, flow, dir,
+				 &flow_rule->rule->action.entries[2],
+				 &flow_rule->rule->action.entries[3]) < 0)
+		return -1;
+
+	i = 4;
+	if (flow->flags & FLOW_OFFLOAD_SNAT) {
+		flow_offload_ipv4_snat(net, flow, dir,
+				       &flow_rule->rule->action.entries[i++]);
+		flow_offload_port_snat(net, flow, dir,
+				       &flow_rule->rule->action.entries[i++]);
+	}
+	if (flow->flags & FLOW_OFFLOAD_DNAT) {
+		flow_offload_ipv4_dnat(net, flow, dir,
+				       &flow_rule->rule->action.entries[i++]);
+		flow_offload_port_dnat(net, flow, dir,
+				       &flow_rule->rule->action.entries[i++]);
+	}
+	if (flow->flags & FLOW_OFFLOAD_SNAT ||
+	    flow->flags & FLOW_OFFLOAD_DNAT)
+		flow_offload_ipv4_checksum(net, flow,
+					   &flow_rule->rule->action.entries[i++]);
+
+	flow_offload_redirect(flow, dir, &flow_rule->rule->action.entries[i++]);
+
+	return i;
+}
+EXPORT_SYMBOL_GPL(nf_flow_rule_route);
+
+static struct nf_flow_rule *
+nf_flow_offload_rule_alloc(struct net *net,
+			   const struct flow_offload_work *offload,
+			   enum flow_offload_tuple_dir dir)
+{
+	const struct nf_flowtable *flowtable = offload->flowtable;
+	const struct flow_offload *flow = offload->flow;
+	const struct flow_offload_tuple *tuple;
+	struct nf_flow_rule *flow_rule;
+	int err = -ENOMEM, num_actions;
+
+	flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL);
+	if (!flow_rule)
+		goto err_flow;
+
+	flow_rule->rule = flow_rule_alloc(10);
+	if (!flow_rule->rule)
+		goto err_flow_rule;
+
+	flow_rule->rule->match.dissector = &flow_rule->match.dissector;
+	flow_rule->rule->match.mask = &flow_rule->match.mask;
+	flow_rule->rule->match.key = &flow_rule->match.key;
+
+	tuple = &flow->tuplehash[dir].tuple;
+	err = nf_flow_rule_match(&flow_rule->match, tuple);
+	if (err < 0)
+		goto err_flow_match;
+
+	num_actions = flowtable->type->action(net, flow, dir, flow_rule);
+	if (num_actions < 0)
+		goto err_flow_match;
+
+	flow_rule->rule->action.num_entries = num_actions;
+
+	return flow_rule;
+
+err_flow_match:
+	kfree(flow_rule->rule);
+err_flow_rule:
+	kfree(flow_rule);
+err_flow:
+	return NULL;
+}
+
+static void __nf_flow_offload_destroy(struct nf_flow_rule *flow_rule)
+{
+	struct flow_action_entry *entry;
+	int i;
+
+	for (i = 0; i < flow_rule->rule->action.num_entries; i++) {
+		entry = &flow_rule->rule->action.entries[i];
+		if (entry->id != FLOW_ACTION_REDIRECT)
+			continue;
+
+		dev_put(entry->dev);
+	}
+	kfree(flow_rule->rule);
+	kfree(flow_rule);
+}
+
+static void nf_flow_offload_destroy(struct nf_flow_rule *flow_rule[])
+{
+	int i;
+
+	for (i = 0; i < FLOW_OFFLOAD_DIR_MAX; i++)
+		__nf_flow_offload_destroy(flow_rule[i]);
+}
+
+static int nf_flow_offload_alloc(const struct flow_offload_work *offload,
+				 struct nf_flow_rule *flow_rule[])
+{
+	struct net *net = read_pnet(&offload->flowtable->net);
+
+	flow_rule[0] = nf_flow_offload_rule_alloc(net, offload,
+						  FLOW_OFFLOAD_DIR_ORIGINAL);
+	if (!flow_rule[0])
+		return -ENOMEM;
+
+	flow_rule[1] = nf_flow_offload_rule_alloc(net, offload,
+						  FLOW_OFFLOAD_DIR_REPLY);
+	if (!flow_rule[1]) {
+		__nf_flow_offload_destroy(flow_rule[0]);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void nf_flow_offload_init(struct flow_cls_offload *cls_flow,
+				 __be16 proto, int priority,
+				 enum flow_cls_command cmd,
+				 const struct flow_offload_tuple *tuple,
+				 struct netlink_ext_ack *extack)
+{
+	cls_flow->common.protocol = proto;
+	cls_flow->common.prio = priority;
+	cls_flow->common.extack = extack;
+	cls_flow->command = cmd;
+	cls_flow->cookie = (unsigned long)tuple;
+}
+
+static int flow_offload_tuple_add(struct flow_offload_work *offload,
+				  struct nf_flow_rule *flow_rule,
+				  enum flow_offload_tuple_dir dir)
+{
+	struct nf_flowtable *flowtable = offload->flowtable;
+	struct flow_cls_offload cls_flow = {};
+	struct flow_block_cb *block_cb;
+	struct netlink_ext_ack extack;
+	__be16 proto = ETH_P_ALL;
+	int err, i = 0;
+
+	nf_flow_offload_init(&cls_flow, proto, offload->priority,
+			     FLOW_CLS_REPLACE,
+			     &offload->flow->tuplehash[dir].tuple, &extack);
+	cls_flow.rule = flow_rule->rule;
+
+	list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) {
+		err = block_cb->cb(TC_SETUP_FT, &cls_flow,
+				   block_cb->cb_priv);
+		if (err < 0)
+			continue;
+
+		i++;
+	}
+
+	return i;
+}
+
+static void flow_offload_tuple_del(struct flow_offload_work *offload,
+				   enum flow_offload_tuple_dir dir)
+{
+	struct nf_flowtable *flowtable = offload->flowtable;
+	struct flow_cls_offload cls_flow = {};
+	struct flow_block_cb *block_cb;
+	struct netlink_ext_ack extack;
+	__be16 proto = ETH_P_ALL;
+
+	nf_flow_offload_init(&cls_flow, proto, offload->priority,
+			     FLOW_CLS_DESTROY,
+			     &offload->flow->tuplehash[dir].tuple, &extack);
+
+	list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list)
+		block_cb->cb(TC_SETUP_FT, &cls_flow, block_cb->cb_priv);
+
+	offload->flow->flags |= FLOW_OFFLOAD_HW_DEAD;
+}
+
+static int flow_offload_rule_add(struct flow_offload_work *offload,
+				 struct nf_flow_rule *flow_rule[])
+{
+	int ok_count = 0;
+
+	ok_count += flow_offload_tuple_add(offload, flow_rule[0],
+					   FLOW_OFFLOAD_DIR_ORIGINAL);
+	ok_count += flow_offload_tuple_add(offload, flow_rule[1],
+					   FLOW_OFFLOAD_DIR_REPLY);
+	if (ok_count == 0)
+		return -ENOENT;
+
+	return 0;
+}
+
+static int flow_offload_work_add(struct flow_offload_work *offload)
+{
+	struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX];
+	int err;
+
+	err = nf_flow_offload_alloc(offload, flow_rule);
+	if (err < 0)
+		return -ENOMEM;
+
+	err = flow_offload_rule_add(offload, flow_rule);
+
+	nf_flow_offload_destroy(flow_rule);
+
+	return err;
+}
+
+static void flow_offload_work_del(struct flow_offload_work *offload)
+{
+	flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL);
+	flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY);
+}
+
+static void flow_offload_tuple_stats(struct flow_offload_work *offload,
+				     enum flow_offload_tuple_dir dir,
+				     struct flow_stats *stats)
+{
+	struct nf_flowtable *flowtable = offload->flowtable;
+	struct flow_cls_offload cls_flow = {};
+	struct flow_block_cb *block_cb;
+	struct netlink_ext_ack extack;
+	__be16 proto = ETH_P_ALL;
+
+	nf_flow_offload_init(&cls_flow, proto, offload->priority,
+			     FLOW_CLS_STATS,
+			     &offload->flow->tuplehash[dir].tuple, &extack);
+
+	list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list)
+		block_cb->cb(TC_SETUP_FT, &cls_flow, block_cb->cb_priv);
+	memcpy(stats, &cls_flow.stats, sizeof(*stats));
+}
+
+static void flow_offload_work_stats(struct flow_offload_work *offload)
+{
+	struct flow_stats stats[FLOW_OFFLOAD_DIR_MAX] = {};
+	u64 lastused;
+
+	flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]);
+	flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]);
+
+	lastused = max_t(u64, stats[0].lastused, stats[1].lastused);
+	offload->flow->timeout = max_t(u64, offload->flow->timeout,
+				       lastused + NF_FLOW_TIMEOUT);
+}
+
+static void flow_offload_work_handler(struct work_struct *work)
+{
+	struct flow_offload_work *offload, *next;
+	LIST_HEAD(offload_pending_list);
+	int ret;
+
+	spin_lock_bh(&flow_offload_pending_list_lock);
+	list_replace_init(&flow_offload_pending_list, &offload_pending_list);
+	spin_unlock_bh(&flow_offload_pending_list_lock);
+
+	list_for_each_entry_safe(offload, next, &offload_pending_list, list) {
+		switch (offload->cmd) {
+		case FLOW_CLS_REPLACE:
+			ret = flow_offload_work_add(offload);
+			if (ret < 0)
+				offload->flow->flags &= ~FLOW_OFFLOAD_HW;
+			break;
+		case FLOW_CLS_DESTROY:
+			flow_offload_work_del(offload);
+			break;
+		case FLOW_CLS_STATS:
+			flow_offload_work_stats(offload);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+		}
+		list_del(&offload->list);
+		kfree(offload);
+	}
+}
+
+static void flow_offload_queue_work(struct flow_offload_work *offload)
+{
+	spin_lock_bh(&flow_offload_pending_list_lock);
+	list_add_tail(&offload->list, &flow_offload_pending_list);
+	spin_unlock_bh(&flow_offload_pending_list_lock);
+
+	schedule_work(&nf_flow_offload_work);
+}
+
+void nf_flow_offload_add(struct nf_flowtable *flowtable,
+			 struct flow_offload *flow)
+{
+	struct flow_offload_work *offload;
+
+	offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC);
+	if (!offload)
+		return;
+
+	offload->cmd = FLOW_CLS_REPLACE;
+	offload->flow = flow;
+	offload->priority = flowtable->priority;
+	offload->flowtable = flowtable;
+	flow->flags |= FLOW_OFFLOAD_HW;
+
+	flow_offload_queue_work(offload);
+}
+
+void nf_flow_offload_del(struct nf_flowtable *flowtable,
+			 struct flow_offload *flow)
+{
+	struct flow_offload_work *offload;
+
+	offload = kzalloc(sizeof(struct flow_offload_work), GFP_ATOMIC);
+	if (!offload)
+		return;
+
+	offload->cmd = FLOW_CLS_DESTROY;
+	offload->flow = flow;
+	offload->flow->flags |= FLOW_OFFLOAD_HW_DYING;
+	offload->flowtable = flowtable;
+
+	flow_offload_queue_work(offload);
+}
+
+void nf_flow_offload_stats(struct nf_flowtable *flowtable,
+			   struct flow_offload *flow)
+{
+	struct flow_offload_work *offload;
+	s64 delta;
+
+	delta = flow->timeout - jiffies;
+	if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10) ||
+	    flow->flags & FLOW_OFFLOAD_HW_DYING)
+		return;
+
+	offload = kzalloc(sizeof(struct flow_offload_work), GFP_ATOMIC);
+	if (!offload)
+		return;
+
+	offload->cmd = FLOW_CLS_STATS;
+	offload->flow = flow;
+	offload->flowtable = flowtable;
+
+	flow_offload_queue_work(offload);
+}
+
+void nf_flow_table_offload_flush(struct nf_flowtable *flowtable)
+{
+	if (flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)
+		flush_work(&nf_flow_offload_work);
+}
+
+static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
+				     struct flow_block_offload *bo,
+				     enum flow_block_command cmd)
+{
+	struct flow_block_cb *block_cb, *next;
+	int err = 0;
+
+	switch (cmd) {
+	case FLOW_BLOCK_BIND:
+		list_splice(&bo->cb_list, &flowtable->flow_block.cb_list);
+		break;
+	case FLOW_BLOCK_UNBIND:
+		list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) {
+			list_del(&block_cb->list);
+			flow_block_cb_free(block_cb);
+		}
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		err = -EOPNOTSUPP;
+	}
+
+	return err;
+}
+
+int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
+				struct net_device *dev,
+				enum flow_block_command cmd)
+{
+	struct netlink_ext_ack extack = {};
+	struct flow_block_offload bo = {};
+	int err;
+
+	if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD))
+		return 0;
+
+	bo.net		= dev_net(dev);
+	bo.block	= &flowtable->flow_block;
+	bo.command	= cmd;
+	bo.binder_type	= FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+	bo.extack	= &extack;
+	INIT_LIST_HEAD(&bo.cb_list);
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+	if (err < 0)
+		return err;
+
+	return nf_flow_table_block_setup(flowtable, &bo, cmd);
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup);
+
+int nf_flow_table_offload_init(void)
+{
+	INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler);
+
+	return 0;
+}
+
+void nf_flow_table_offload_exit(void)
+{
+	struct flow_offload_work *offload, *next;
+	LIST_HEAD(offload_pending_list);
+
+	cancel_work_sync(&nf_flow_offload_work);
+
+	list_for_each_entry_safe(offload, next, &offload_pending_list, list) {
+		list_del(&offload->list);
+		kfree(offload);
+	}
+}
-- 
cgit v1.2.3-59-g8ed1b


From 25da5eb32cd51383f6dca7aad252376f1979c075 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Oct 2019 16:02:50 +0100
Subject: netfilter: nft_meta: offload support for interface index

This patch adds support for offloading the NFT_META_IIF selector.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_offload.h | 1 +
 net/netfilter/nft_meta.c                  | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h
index 03cf5856d76f..ea7d1d78b92d 100644
--- a/include/net/netfilter/nf_tables_offload.h
+++ b/include/net/netfilter/nf_tables_offload.h
@@ -45,6 +45,7 @@ struct nft_flow_key {
 	struct flow_dissector_key_ip			ip;
 	struct flow_dissector_key_vlan			vlan;
 	struct flow_dissector_key_eth_addrs		eth_addrs;
+	struct flow_dissector_key_meta			meta;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
 
 struct nft_flow_match {
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 317e3a9e8c5b..8fd21f436347 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -547,6 +547,10 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx,
 				  sizeof(__u8), reg);
 		nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);
 		break;
+	case NFT_META_IIF:
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta,
+				  ingress_ifindex, sizeof(__u32), reg);
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 975b992fdd4b38028d7c1dcf38286d6e7991c1b2 Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@mellanox.com>
Date: Tue, 12 Nov 2019 00:34:29 +0100
Subject: net/mlx5: Add new chain for netfilter flow table offload

Netfilter tables (nftables) implements a software datapath that
comes after tc ingress datapath. The datapath supports offloading
such rules via the flow table offload API.

This API is currently only used by NFT and it doesn't provide the
global priority in regards to tc offload, so we assume offloading such
rules must come after tc. It does provide a flow table priority
parameter, so we need to provide some supported priority range.

For that, split fastpath prio to two, flow table offload and tc offload,
with one dedicated priority chain for flow table offload.

Next patch will re-use the multi chain API to access this chain by
allowing access to this chain by the fdb_sub_namespace.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Acked-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 9 +++++++--
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 9 ++++++---
 include/linux/mlx5/fs.h                           | 3 ++-
 3 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 8c9d8dc85861..2b563700c664 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -44,7 +44,12 @@
 #include "lib/mpfs.h"
 
 #define FDB_TC_MAX_CHAIN 3
-#define FDB_TC_SLOW_PATH_CHAIN (FDB_TC_MAX_CHAIN + 1)
+#define FDB_FT_CHAIN (FDB_TC_MAX_CHAIN + 1)
+#define FDB_TC_SLOW_PATH_CHAIN (FDB_FT_CHAIN + 1)
+
+/* The index of the last real chain (FT) + 1 as chain zero is valid as well */
+#define FDB_NUM_CHAINS (FDB_FT_CHAIN + 1)
+
 #define FDB_TC_MAX_PRIO 16
 #define FDB_TC_LEVELS_PER_PRIO 2
 
@@ -173,7 +178,7 @@ struct mlx5_eswitch_fdb {
 			struct {
 				struct mlx5_flow_table *fdb;
 				u32 num_rules;
-			} fdb_prio[FDB_TC_MAX_CHAIN + 1][FDB_TC_MAX_PRIO + 1][FDB_TC_LEVELS_PER_PRIO];
+			} fdb_prio[FDB_NUM_CHAINS][FDB_TC_MAX_PRIO + 1][FDB_TC_LEVELS_PER_PRIO];
 			/* Protects fdb_prio table */
 			struct mutex fdb_prio_lock;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 4aa6990a38b3..84e90b21e148 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2624,16 +2624,19 @@ static int create_fdb_chains(struct mlx5_flow_steering *steering,
 
 static int create_fdb_fast_path(struct mlx5_flow_steering *steering)
 {
-	const int total_chains = FDB_TC_MAX_CHAIN + 1;
 	int err;
 
-	steering->fdb_sub_ns = kcalloc(total_chains,
+	steering->fdb_sub_ns = kcalloc(FDB_NUM_CHAINS,
 				       sizeof(*steering->fdb_sub_ns),
 				       GFP_KERNEL);
 	if (!steering->fdb_sub_ns)
 		return -ENOMEM;
 
-	err = create_fdb_chains(steering, FDB_FAST_PATH, FDB_TC_MAX_CHAIN + 1);
+	err = create_fdb_chains(steering, FDB_TC_OFFLOAD, FDB_TC_MAX_CHAIN + 1);
+	if (err)
+		return err;
+
+	err = create_fdb_chains(steering, FDB_FT_OFFLOAD, 1);
 	if (err)
 		return err;
 
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 724d276ea133..4e5b84e66822 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -80,7 +80,8 @@ enum mlx5_flow_namespace_type {
 
 enum {
 	FDB_BYPASS_PATH,
-	FDB_FAST_PATH,
+	FDB_TC_OFFLOAD,
+	FDB_FT_OFFLOAD,
 	FDB_SLOW_PATH,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From bd1903b7c4596ba6f7677d0dfefd05ba5876707d Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Wed, 13 Nov 2019 23:04:49 +0800
Subject: net: openvswitch: add hash info to upcall

When using the kernel datapath, the upcall don't
include skb hash info relatived. That will introduce
some problem, because the hash of skb is important
in kernel stack. For example, VXLAN module uses
it to select UDP src port. The tx queue selection
may also use the hash in stack.

Hash is computed in different ways. Hash is random
for a TCP socket, and hash may be computed in hardware,
or software stack. Recalculation hash is not easy.

Hash of TCP socket is computed:
tcp_v4_connect
    -> sk_set_txhash (is random)

__tcp_transmit_skb
    -> skb_set_hash_from_sk

There will be one upcall, without information of skb
hash, to ovs-vswitchd, for the first packet of a TCP
session. The rest packets will be processed in Open vSwitch
modules, hash kept. If this tcp session is forward to
VXLAN module, then the UDP src port of first tcp packet
is different from rest packets.

TCP packets may come from the host or dockers, to Open vSwitch.
To fix it, we store the hash info to upcall, and restore hash
when packets sent back.

+---------------+          +-------------------------+
|   Docker/VMs  |          |     ovs-vswitchd        |
+----+----------+          +-+--------------------+--+
     |                       ^                    |
     |                       |                    |
     |                       |  upcall            v restore packet hash (not recalculate)
     |                     +-+--------------------+--+
     |  tap netdev         |                         |   vxlan module
     +--------------->     +-->  Open vSwitch ko     +-->
       or internal type    |                         |
                           +-------------------------+

Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-October/364062.html
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h |  4 +++-
 net/openvswitch/datapath.c       | 26 +++++++++++++++++++++++++-
 net/openvswitch/datapath.h       | 12 ++++++++++++
 3 files changed, 40 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 1887a451c388..a87b44cd5590 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -173,6 +173,7 @@ enum ovs_packet_cmd {
  * @OVS_PACKET_ATTR_LEN: Packet size before truncation.
  * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment
  * size.
+ * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb).
  *
  * These attributes follow the &struct ovs_header within the Generic Netlink
  * payload for %OVS_PACKET_* commands.
@@ -190,7 +191,8 @@ enum ovs_packet_attr {
 	OVS_PACKET_ATTR_PROBE,      /* Packet operation is a feature probe,
 				       error logging should be suppressed. */
 	OVS_PACKET_ATTR_MRU,	    /* Maximum received IP fragment size. */
-	OVS_PACKET_ATTR_LEN,		/* Packet size before truncation. */
+	OVS_PACKET_ATTR_LEN,	    /* Packet size before truncation. */
+	OVS_PACKET_ATTR_HASH,	    /* Packet hash. */
 	__OVS_PACKET_ATTR_MAX
 };
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 2088619c03f0..8ce1f773378d 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -350,7 +350,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
 	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
 		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
 		+ nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */
-		+ nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */
+		+ nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */
+		+ nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */
 
 	/* OVS_PACKET_ATTR_USERDATA */
 	if (upcall_info->userdata)
@@ -393,6 +394,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 	size_t len;
 	unsigned int hlen;
 	int err, dp_ifindex;
+	u64 hash;
 
 	dp_ifindex = get_dpifindex(dp);
 	if (!dp_ifindex)
@@ -504,6 +506,19 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 		pad_packet(dp, user_skb);
 	}
 
+	/* Add OVS_PACKET_ATTR_HASH */
+	hash = skb_get_hash_raw(skb);
+	if (skb->sw_hash)
+		hash |= OVS_PACKET_HASH_SW_BIT;
+
+	if (skb->l4_hash)
+		hash |= OVS_PACKET_HASH_L4_BIT;
+
+	if (nla_put(user_skb, OVS_PACKET_ATTR_HASH, sizeof (u64), &hash)) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
 	/* Only reserve room for attribute header, packet data is added
 	 * in skb_zerocopy() */
 	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) {
@@ -543,6 +558,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	struct datapath *dp;
 	struct vport *input_vport;
 	u16 mru = 0;
+	u64 hash;
 	int len;
 	int err;
 	bool log = !a[OVS_PACKET_ATTR_PROBE];
@@ -568,6 +584,14 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 	}
 	OVS_CB(packet)->mru = mru;
 
+	if (a[OVS_PACKET_ATTR_HASH]) {
+		hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]);
+
+		__skb_set_hash(packet, hash & 0xFFFFFFFFULL,
+			       !!(hash & OVS_PACKET_HASH_SW_BIT),
+			       !!(hash & OVS_PACKET_HASH_L4_BIT));
+	}
+
 	/* Build an sw_flow for sending this packet. */
 	flow = ovs_flow_alloc();
 	err = PTR_ERR(flow);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 81e85dde8217..e239a46c2f94 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -139,6 +139,18 @@ struct ovs_net {
 	bool xt_label;
 };
 
+/**
+ * enum ovs_pkt_hash_types - hash info to include with a packet
+ * to send to userspace.
+ * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack.
+ * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash
+ * over transport ports.
+ */
+enum ovs_pkt_hash_types {
+	OVS_PACKET_HASH_SW_BIT = (1ULL << 32),
+	OVS_PACKET_HASH_L4_BIT = (1ULL << 33),
+};
+
 extern unsigned int ovs_net_id;
 void ovs_lock(void);
 void ovs_unlock(void);
-- 
cgit v1.2.3-59-g8ed1b


From 4d66c56f7efe122d09d06cd3ebfa52a43d51a9cb Mon Sep 17 00:00:00 2001
From: Dan Murphy <dmurphy@ti.com>
Date: Wed, 13 Nov 2019 10:42:25 -0600
Subject: dt-bindings: net: dp83869: Add TI dp83869 phy

Add dt bindings for the TI dp83869 Gigabit ethernet phy
device.

Signed-off-by: Dan Murphy <dmurphy@ti.com>
CC: Rob Herring <robh+dt@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../devicetree/bindings/net/ti,dp83869.yaml        | 84 ++++++++++++++++++++++
 include/dt-bindings/net/ti-dp83869.h               | 42 +++++++++++
 2 files changed, 126 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/ti,dp83869.yaml
 create mode 100644 include/dt-bindings/net/ti-dp83869.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/net/ti,dp83869.yaml b/Documentation/devicetree/bindings/net/ti,dp83869.yaml
new file mode 100644
index 000000000000..6fe3e451da8a
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/ti,dp83869.yaml
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2019 Texas Instruments Incorporated
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/net/ti,dp83869.yaml#"
+$schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+title: TI DP83869 ethernet PHY
+
+allOf:
+  - $ref: "ethernet-controller.yaml#"
+
+maintainers:
+  - Dan Murphy <dmurphy@ti.com>
+
+description: |
+  The DP83869HM device is a robust, fully-featured Gigabit (PHY) transceiver
+  with integrated PMD sublayers that supports 10BASE-Te, 100BASE-TX and
+  1000BASE-T Ethernet protocols. The DP83869 also supports 1000BASE-X and
+  100BASE-FX Fiber protocols.
+  This device interfaces to the MAC layer through Reduced GMII (RGMII) and
+  SGMII The DP83869HM supports Media Conversion in Managed mode. In this mode,
+  the DP83869HM can run 1000BASE-X-to-1000BASE-T and 100BASE-FX-to-100BASE-TX
+  conversions.  The DP83869HM can also support Bridge Conversion from RGMII to
+  SGMII and SGMII to RGMII.
+
+  Specifications about the charger can be found at:
+    http://www.ti.com/lit/ds/symlink/dp83869hm.pdf
+
+properties:
+  reg:
+    maxItems: 1
+
+  ti,min-output-impedance:
+    type: boolean
+    description: |
+       MAC Interface Impedance control to set the programmable output impedance
+       to a minimum value (35 ohms).
+
+  ti,max-output-impedance:
+    type: boolean
+    description: |
+       MAC Interface Impedance control to set the programmable output impedance
+       to a maximum value (70 ohms).
+
+  tx-fifo-depth:
+    $ref: /schemas/types.yaml#definitions/uint32
+    description: |
+       Transmitt FIFO depth see dt-bindings/net/ti-dp83869.h for values
+
+  rx-fifo-depth:
+    $ref: /schemas/types.yaml#definitions/uint32
+    description: |
+       Receive FIFO depth see dt-bindings/net/ti-dp83869.h for values
+
+  ti,clk-output-sel:
+    $ref: /schemas/types.yaml#definitions/uint32
+    description: |
+       Muxing option for CLK_OUT pin see dt-bindings/net/ti-dp83869.h for values.
+
+  ti,op-mode:
+    $ref: /schemas/types.yaml#definitions/uint32
+    description: |
+       Operational mode for the PHY.  If this is not set then the operational
+       mode is set by the straps. see dt-bindings/net/ti-dp83869.h for values
+
+required:
+  - reg
+
+examples:
+  - |
+    #include <dt-bindings/net/ti-dp83869.h>
+    mdio0 {
+      #address-cells = <1>;
+      #size-cells = <0>;
+      ethphy0: ethernet-phy@0 {
+        reg = <0>;
+        tx-fifo-depth = <DP83869_PHYCR_FIFO_DEPTH_4_B_NIB>;
+        rx-fifo-depth = <DP83869_PHYCR_FIFO_DEPTH_4_B_NIB>;
+        ti,op-mode = <DP83869_RGMII_COPPER_ETHERNET>;
+        ti,max-output-impedance = "true";
+        ti,clk-output-sel = <DP83869_CLK_O_SEL_CHN_A_RCLK>;
+      };
+    };
diff --git a/include/dt-bindings/net/ti-dp83869.h b/include/dt-bindings/net/ti-dp83869.h
new file mode 100644
index 000000000000..218b1a64e975
--- /dev/null
+++ b/include/dt-bindings/net/ti-dp83869.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Device Tree constants for the Texas Instruments DP83869 PHY
+ *
+ * Author: Dan Murphy <dmurphy@ti.com>
+ *
+ * Copyright:   (C) 2019 Texas Instruments, Inc.
+ */
+
+#ifndef _DT_BINDINGS_TI_DP83869_H
+#define _DT_BINDINGS_TI_DP83869_H
+
+/* PHY CTRL bits */
+#define DP83869_PHYCR_FIFO_DEPTH_3_B_NIB	0x00
+#define DP83869_PHYCR_FIFO_DEPTH_4_B_NIB	0x01
+#define DP83869_PHYCR_FIFO_DEPTH_6_B_NIB	0x02
+#define DP83869_PHYCR_FIFO_DEPTH_8_B_NIB	0x03
+
+/* IO_MUX_CFG - Clock output selection */
+#define DP83869_CLK_O_SEL_CHN_A_RCLK		0x0
+#define DP83869_CLK_O_SEL_CHN_B_RCLK		0x1
+#define DP83869_CLK_O_SEL_CHN_C_RCLK		0x2
+#define DP83869_CLK_O_SEL_CHN_D_RCLK		0x3
+#define DP83869_CLK_O_SEL_CHN_A_RCLK_DIV5	0x4
+#define DP83869_CLK_O_SEL_CHN_B_RCLK_DIV5	0x5
+#define DP83869_CLK_O_SEL_CHN_C_RCLK_DIV5	0x6
+#define DP83869_CLK_O_SEL_CHN_D_RCLK_DIV5	0x7
+#define DP83869_CLK_O_SEL_CHN_A_TCLK		0x8
+#define DP83869_CLK_O_SEL_CHN_B_TCLK		0x9
+#define DP83869_CLK_O_SEL_CHN_C_TCLK		0xa
+#define DP83869_CLK_O_SEL_CHN_D_TCLK		0xb
+#define DP83869_CLK_O_SEL_REF_CLK		0xc
+
+#define DP83869_RGMII_COPPER_ETHERNET		0x00
+#define DP83869_RGMII_1000_BASE			0x01
+#define DP83869_RGMII_100_BASE			0x02
+#define DP83869_RGMII_SGMII_BRIDGE		0x03
+#define DP83869_1000M_MEDIA_CONVERT		0x04
+#define DP83869_100M_MEDIA_CONVERT		0x05
+#define DP83869_SGMII_COPPER_ETHERNET		0x06
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From db205c766862edae48d64e69e2f2502e2a3e9135 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Thu, 14 Nov 2019 10:57:37 +0100
Subject: vsock: remove vm_sockets_get_local_cid()

vm_sockets_get_local_cid() is only used in virtio_transport_common.c.
We can replace it calling the virtio_transport_get_ops() and
using the get_local_cid() callback registered by the transport.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/vm_sockets.h              |  2 --
 net/vmw_vsock/af_vsock.c                | 10 ----------
 net/vmw_vsock/virtio_transport_common.c |  2 +-
 3 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/vm_sockets.h b/include/linux/vm_sockets.h
index 33f1a2ecd905..7dd899ccb920 100644
--- a/include/linux/vm_sockets.h
+++ b/include/linux/vm_sockets.h
@@ -10,6 +10,4 @@
 
 #include <uapi/linux/vm_sockets.h>
 
-int vm_sockets_get_local_cid(void);
-
 #endif /* _VM_SOCKETS_H */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 1f4fde4711b6..eb13693e9d04 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -129,16 +129,6 @@ static struct proto vsock_proto = {
 static const struct vsock_transport *transport;
 static DEFINE_MUTEX(vsock_register_mutex);
 
-/**** EXPORTS ****/
-
-/* Get the ID of the local context.  This is transport dependent. */
-
-int vm_sockets_get_local_cid(void)
-{
-	return transport->get_local_cid();
-}
-EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid);
-
 /**** UTILS ****/
 
 /* Each bound VSocket is stored in the bind hash table and each connected
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 828edd88488c..3edc373d2acc 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -168,7 +168,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 	struct virtio_vsock_pkt *pkt;
 	u32 pkt_len = info->pkt_len;
 
-	src_cid = vm_sockets_get_local_cid();
+	src_cid = virtio_transport_get_ops()->transport.get_local_cid();
 	src_port = vsk->local_addr.svm_port;
 	if (!info->remote_cid) {
 		dst_cid	= vsk->remote_addr.svm_cid;
-- 
cgit v1.2.3-59-g8ed1b


From 3603a2e991a82e5094c3107a792859b08342aed3 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Thu, 14 Nov 2019 10:57:38 +0100
Subject: vsock: remove include/linux/vm_sockets.h file

This header file now only includes the "uapi/linux/vm_sockets.h".
We can include directly it when needed.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/vm_sockets.h            | 13 -------------
 include/net/af_vsock.h                |  2 +-
 include/net/vsock_addr.h              |  2 +-
 net/vmw_vsock/vmci_transport_notify.h |  1 -
 4 files changed, 2 insertions(+), 16 deletions(-)
 delete mode 100644 include/linux/vm_sockets.h

(limited to 'include')

diff --git a/include/linux/vm_sockets.h b/include/linux/vm_sockets.h
deleted file mode 100644
index 7dd899ccb920..000000000000
--- a/include/linux/vm_sockets.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * VMware vSockets Driver
- *
- * Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
- */
-
-#ifndef _VM_SOCKETS_H
-#define _VM_SOCKETS_H
-
-#include <uapi/linux/vm_sockets.h>
-
-#endif /* _VM_SOCKETS_H */
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 80ea0f93d3f7..c660402b10f2 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -10,7 +10,7 @@
 
 #include <linux/kernel.h>
 #include <linux/workqueue.h>
-#include <linux/vm_sockets.h>
+#include <uapi/linux/vm_sockets.h>
 
 #include "vsock_addr.h"
 
diff --git a/include/net/vsock_addr.h b/include/net/vsock_addr.h
index 57d2db5c4bdf..cf8cc140d68d 100644
--- a/include/net/vsock_addr.h
+++ b/include/net/vsock_addr.h
@@ -8,7 +8,7 @@
 #ifndef _VSOCK_ADDR_H_
 #define _VSOCK_ADDR_H_
 
-#include <linux/vm_sockets.h>
+#include <uapi/linux/vm_sockets.h>
 
 void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port);
 int vsock_addr_validate(const struct sockaddr_vm *addr);
diff --git a/net/vmw_vsock/vmci_transport_notify.h b/net/vmw_vsock/vmci_transport_notify.h
index 7843f08d4290..a1aa5a998c0e 100644
--- a/net/vmw_vsock/vmci_transport_notify.h
+++ b/net/vmw_vsock/vmci_transport_notify.h
@@ -11,7 +11,6 @@
 #include <linux/types.h>
 #include <linux/vmw_vmci_defs.h>
 #include <linux/vmw_vmci_api.h>
-#include <linux/vm_sockets.h>
 
 #include "vmci_transport.h"
 
-- 
cgit v1.2.3-59-g8ed1b


From fe502c4a38d97e5f8b9d5602af1f07f5abc529d2 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Thu, 14 Nov 2019 10:57:39 +0100
Subject: vsock: add 'transport' member in the struct vsock_sock

As a preparation to support multiple transports, this patch adds
the 'transport' member at the 'struct vsock_sock'.
This new field is initialized during the creation in the
__vsock_create() function.

This patch also renames the global 'transport' pointer to
'transport_single', since for now we're only supporting a single
transport registered at run-time.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_vsock.h   |  1 +
 net/vmw_vsock/af_vsock.c | 56 ++++++++++++++++++++++++++++++++----------------
 2 files changed, 39 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index c660402b10f2..a5e1e134261d 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -27,6 +27,7 @@ extern spinlock_t vsock_table_lock;
 struct vsock_sock {
 	/* sk must be the first member. */
 	struct sock sk;
+	const struct vsock_transport *transport;
 	struct sockaddr_vm local_addr;
 	struct sockaddr_vm remote_addr;
 	/* Links for the global tables of bound and connected sockets. */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index eb13693e9d04..d813967d7dd5 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -126,7 +126,7 @@ static struct proto vsock_proto = {
  */
 #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
 
-static const struct vsock_transport *transport;
+static const struct vsock_transport *transport_single;
 static DEFINE_MUTEX(vsock_register_mutex);
 
 /**** UTILS ****/
@@ -408,7 +408,9 @@ static bool vsock_is_pending(struct sock *sk)
 
 static int vsock_send_shutdown(struct sock *sk, int mode)
 {
-	return transport->shutdown(vsock_sk(sk), mode);
+	struct vsock_sock *vsk = vsock_sk(sk);
+
+	return vsk->transport->shutdown(vsk, mode);
 }
 
 static void vsock_pending_work(struct work_struct *work)
@@ -518,7 +520,7 @@ static int __vsock_bind_stream(struct vsock_sock *vsk,
 static int __vsock_bind_dgram(struct vsock_sock *vsk,
 			      struct sockaddr_vm *addr)
 {
-	return transport->dgram_bind(vsk, addr);
+	return vsk->transport->dgram_bind(vsk, addr);
 }
 
 static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
@@ -536,7 +538,7 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
 	 * like AF_INET prevents binding to a non-local IP address (in most
 	 * cases), we only allow binding to the local CID.
 	 */
-	cid = transport->get_local_cid();
+	cid = vsk->transport->get_local_cid();
 	if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY)
 		return -EADDRNOTAVAIL;
 
@@ -586,6 +588,7 @@ struct sock *__vsock_create(struct net *net,
 		sk->sk_type = type;
 
 	vsk = vsock_sk(sk);
+	vsk->transport = transport_single;
 	vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
 	vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
 
@@ -616,7 +619,7 @@ struct sock *__vsock_create(struct net *net,
 		vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
 	}
 
-	if (transport->init(vsk, psk) < 0) {
+	if (vsk->transport->init(vsk, psk) < 0) {
 		sk_free(sk);
 		return NULL;
 	}
@@ -640,7 +643,7 @@ static void __vsock_release(struct sock *sk, int level)
 		/* The release call is supposed to use lock_sock_nested()
 		 * rather than lock_sock(), if a sock lock should be acquired.
 		 */
-		transport->release(vsk);
+		vsk->transport->release(vsk);
 
 		/* When "level" is SINGLE_DEPTH_NESTING, use the nested
 		 * version to avoid the warning "possible recursive locking
@@ -668,7 +671,7 @@ static void vsock_sk_destruct(struct sock *sk)
 {
 	struct vsock_sock *vsk = vsock_sk(sk);
 
-	transport->destruct(vsk);
+	vsk->transport->destruct(vsk);
 
 	/* When clearing these addresses, there's no need to set the family and
 	 * possibly register the address family with the kernel.
@@ -692,13 +695,13 @@ static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 s64 vsock_stream_has_data(struct vsock_sock *vsk)
 {
-	return transport->stream_has_data(vsk);
+	return vsk->transport->stream_has_data(vsk);
 }
 EXPORT_SYMBOL_GPL(vsock_stream_has_data);
 
 s64 vsock_stream_has_space(struct vsock_sock *vsk)
 {
-	return transport->stream_has_space(vsk);
+	return vsk->transport->stream_has_space(vsk);
 }
 EXPORT_SYMBOL_GPL(vsock_stream_has_space);
 
@@ -867,6 +870,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
 			mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 
 	} else if (sock->type == SOCK_STREAM) {
+		const struct vsock_transport *transport = vsk->transport;
 		lock_sock(sk);
 
 		/* Listening sockets that have connections in their accept
@@ -942,6 +946,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 	struct sock *sk;
 	struct vsock_sock *vsk;
 	struct sockaddr_vm *remote_addr;
+	const struct vsock_transport *transport;
 
 	if (msg->msg_flags & MSG_OOB)
 		return -EOPNOTSUPP;
@@ -950,6 +955,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 	err = 0;
 	sk = sock->sk;
 	vsk = vsock_sk(sk);
+	transport = vsk->transport;
 
 	lock_sock(sk);
 
@@ -1034,8 +1040,8 @@ static int vsock_dgram_connect(struct socket *sock,
 	if (err)
 		goto out;
 
-	if (!transport->dgram_allow(remote_addr->svm_cid,
-				    remote_addr->svm_port)) {
+	if (!vsk->transport->dgram_allow(remote_addr->svm_cid,
+					 remote_addr->svm_port)) {
 		err = -EINVAL;
 		goto out;
 	}
@@ -1051,7 +1057,9 @@ out:
 static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 			       size_t len, int flags)
 {
-	return transport->dgram_dequeue(vsock_sk(sock->sk), msg, len, flags);
+	struct vsock_sock *vsk = vsock_sk(sock->sk);
+
+	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
 }
 
 static const struct proto_ops vsock_dgram_ops = {
@@ -1077,6 +1085,8 @@ static const struct proto_ops vsock_dgram_ops = {
 
 static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
 {
+	const struct vsock_transport *transport = vsk->transport;
+
 	if (!transport->cancel_pkt)
 		return -EOPNOTSUPP;
 
@@ -1113,6 +1123,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
 	int err;
 	struct sock *sk;
 	struct vsock_sock *vsk;
+	const struct vsock_transport *transport;
 	struct sockaddr_vm *remote_addr;
 	long timeout;
 	DEFINE_WAIT(wait);
@@ -1120,6 +1131,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
 	err = 0;
 	sk = sock->sk;
 	vsk = vsock_sk(sk);
+	transport = vsk->transport;
 
 	lock_sock(sk);
 
@@ -1363,6 +1375,7 @@ static int vsock_stream_setsockopt(struct socket *sock,
 	int err;
 	struct sock *sk;
 	struct vsock_sock *vsk;
+	const struct vsock_transport *transport;
 	u64 val;
 
 	if (level != AF_VSOCK)
@@ -1383,6 +1396,7 @@ static int vsock_stream_setsockopt(struct socket *sock,
 	err = 0;
 	sk = sock->sk;
 	vsk = vsock_sk(sk);
+	transport = vsk->transport;
 
 	lock_sock(sk);
 
@@ -1440,6 +1454,7 @@ static int vsock_stream_getsockopt(struct socket *sock,
 	int len;
 	struct sock *sk;
 	struct vsock_sock *vsk;
+	const struct vsock_transport *transport;
 	u64 val;
 
 	if (level != AF_VSOCK)
@@ -1463,6 +1478,7 @@ static int vsock_stream_getsockopt(struct socket *sock,
 	err = 0;
 	sk = sock->sk;
 	vsk = vsock_sk(sk);
+	transport = vsk->transport;
 
 	switch (optname) {
 	case SO_VM_SOCKETS_BUFFER_SIZE:
@@ -1507,6 +1523,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 {
 	struct sock *sk;
 	struct vsock_sock *vsk;
+	const struct vsock_transport *transport;
 	ssize_t total_written;
 	long timeout;
 	int err;
@@ -1515,6 +1532,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 
 	sk = sock->sk;
 	vsk = vsock_sk(sk);
+	transport = vsk->transport;
 	total_written = 0;
 	err = 0;
 
@@ -1646,6 +1664,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 {
 	struct sock *sk;
 	struct vsock_sock *vsk;
+	const struct vsock_transport *transport;
 	int err;
 	size_t target;
 	ssize_t copied;
@@ -1656,6 +1675,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 
 	sk = sock->sk;
 	vsk = vsock_sk(sk);
+	transport = vsk->transport;
 	err = 0;
 
 	lock_sock(sk);
@@ -1870,7 +1890,7 @@ static long vsock_dev_do_ioctl(struct file *filp,
 
 	switch (cmd) {
 	case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
-		if (put_user(transport->get_local_cid(), p) != 0)
+		if (put_user(transport_single->get_local_cid(), p) != 0)
 			retval = -EFAULT;
 		break;
 
@@ -1917,7 +1937,7 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner)
 	if (err)
 		return err;
 
-	if (transport) {
+	if (transport_single) {
 		err = -EBUSY;
 		goto err_busy;
 	}
@@ -1926,7 +1946,7 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner)
 	 * unload while there are open sockets.
 	 */
 	vsock_proto.owner = owner;
-	transport = t;
+	transport_single = t;
 
 	vsock_device.minor = MISC_DYNAMIC_MINOR;
 	err = misc_register(&vsock_device);
@@ -1956,7 +1976,7 @@ err_unregister_proto:
 err_deregister_misc:
 	misc_deregister(&vsock_device);
 err_reset_transport:
-	transport = NULL;
+	transport_single = NULL;
 err_busy:
 	mutex_unlock(&vsock_register_mutex);
 	return err;
@@ -1973,7 +1993,7 @@ void vsock_core_exit(void)
 
 	/* We do not want the assignment below re-ordered. */
 	mb();
-	transport = NULL;
+	transport_single = NULL;
 
 	mutex_unlock(&vsock_register_mutex);
 }
@@ -1984,7 +2004,7 @@ const struct vsock_transport *vsock_core_get_transport(void)
 	/* vsock_register_mutex not taken since only the transport uses this
 	 * function and only while registered.
 	 */
-	return transport;
+	return transport_single;
 }
 EXPORT_SYMBOL_GPL(vsock_core_get_transport);
 
-- 
cgit v1.2.3-59-g8ed1b


From 4c7246dc45e2706770d5233f7ce1597a07e069ba Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Thu, 14 Nov 2019 10:57:40 +0100
Subject: vsock/virtio: add transport parameter to the
 virtio_transport_reset_no_sock()

We are going to add 'struct vsock_sock *' parameter to
virtio_transport_get_ops().

In some cases, like in the virtio_transport_reset_no_sock(),
we don't have any socket assigned to the packet received,
so we can't use the virtio_transport_get_ops().

In order to allow virtio_transport_reset_no_sock() to use the
'.send_pkt' callback from the 'vhost_transport' or 'virtio_transport',
we add the 'struct virtio_transport *' to it and to its caller:
virtio_transport_recv_pkt().

We moved the 'vhost_transport' and 'virtio_transport' definition,
to pass their address to the virtio_transport_recv_pkt().

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/vhost/vsock.c                   |  94 +++++++++----------
 include/linux/virtio_vsock.h            |   3 +-
 net/vmw_vsock/virtio_transport.c        | 160 ++++++++++++++++----------------
 net/vmw_vsock/virtio_transport_common.c |  12 +--
 4 files changed, 135 insertions(+), 134 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 9f57736fe15e..92ab3852c954 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -384,6 +384,52 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
 	return val < vq->num;
 }
 
+static struct virtio_transport vhost_transport = {
+	.transport = {
+		.get_local_cid            = vhost_transport_get_local_cid,
+
+		.init                     = virtio_transport_do_socket_init,
+		.destruct                 = virtio_transport_destruct,
+		.release                  = virtio_transport_release,
+		.connect                  = virtio_transport_connect,
+		.shutdown                 = virtio_transport_shutdown,
+		.cancel_pkt               = vhost_transport_cancel_pkt,
+
+		.dgram_enqueue            = virtio_transport_dgram_enqueue,
+		.dgram_dequeue            = virtio_transport_dgram_dequeue,
+		.dgram_bind               = virtio_transport_dgram_bind,
+		.dgram_allow              = virtio_transport_dgram_allow,
+
+		.stream_enqueue           = virtio_transport_stream_enqueue,
+		.stream_dequeue           = virtio_transport_stream_dequeue,
+		.stream_has_data          = virtio_transport_stream_has_data,
+		.stream_has_space         = virtio_transport_stream_has_space,
+		.stream_rcvhiwat          = virtio_transport_stream_rcvhiwat,
+		.stream_is_active         = virtio_transport_stream_is_active,
+		.stream_allow             = virtio_transport_stream_allow,
+
+		.notify_poll_in           = virtio_transport_notify_poll_in,
+		.notify_poll_out          = virtio_transport_notify_poll_out,
+		.notify_recv_init         = virtio_transport_notify_recv_init,
+		.notify_recv_pre_block    = virtio_transport_notify_recv_pre_block,
+		.notify_recv_pre_dequeue  = virtio_transport_notify_recv_pre_dequeue,
+		.notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
+		.notify_send_init         = virtio_transport_notify_send_init,
+		.notify_send_pre_block    = virtio_transport_notify_send_pre_block,
+		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
+		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
+
+		.set_buffer_size          = virtio_transport_set_buffer_size,
+		.set_min_buffer_size      = virtio_transport_set_min_buffer_size,
+		.set_max_buffer_size      = virtio_transport_set_max_buffer_size,
+		.get_buffer_size          = virtio_transport_get_buffer_size,
+		.get_min_buffer_size      = virtio_transport_get_min_buffer_size,
+		.get_max_buffer_size      = virtio_transport_get_max_buffer_size,
+	},
+
+	.send_pkt = vhost_transport_send_pkt,
+};
+
 static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 {
 	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
@@ -438,7 +484,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 
 		/* Only accept correctly addressed packets */
 		if (le64_to_cpu(pkt->hdr.src_cid) == vsock->guest_cid)
-			virtio_transport_recv_pkt(pkt);
+			virtio_transport_recv_pkt(&vhost_transport, pkt);
 		else
 			virtio_transport_free_pkt(pkt);
 
@@ -786,52 +832,6 @@ static struct miscdevice vhost_vsock_misc = {
 	.fops = &vhost_vsock_fops,
 };
 
-static struct virtio_transport vhost_transport = {
-	.transport = {
-		.get_local_cid            = vhost_transport_get_local_cid,
-
-		.init                     = virtio_transport_do_socket_init,
-		.destruct                 = virtio_transport_destruct,
-		.release                  = virtio_transport_release,
-		.connect                  = virtio_transport_connect,
-		.shutdown                 = virtio_transport_shutdown,
-		.cancel_pkt               = vhost_transport_cancel_pkt,
-
-		.dgram_enqueue            = virtio_transport_dgram_enqueue,
-		.dgram_dequeue            = virtio_transport_dgram_dequeue,
-		.dgram_bind               = virtio_transport_dgram_bind,
-		.dgram_allow              = virtio_transport_dgram_allow,
-
-		.stream_enqueue           = virtio_transport_stream_enqueue,
-		.stream_dequeue           = virtio_transport_stream_dequeue,
-		.stream_has_data          = virtio_transport_stream_has_data,
-		.stream_has_space         = virtio_transport_stream_has_space,
-		.stream_rcvhiwat          = virtio_transport_stream_rcvhiwat,
-		.stream_is_active         = virtio_transport_stream_is_active,
-		.stream_allow             = virtio_transport_stream_allow,
-
-		.notify_poll_in           = virtio_transport_notify_poll_in,
-		.notify_poll_out          = virtio_transport_notify_poll_out,
-		.notify_recv_init         = virtio_transport_notify_recv_init,
-		.notify_recv_pre_block    = virtio_transport_notify_recv_pre_block,
-		.notify_recv_pre_dequeue  = virtio_transport_notify_recv_pre_dequeue,
-		.notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
-		.notify_send_init         = virtio_transport_notify_send_init,
-		.notify_send_pre_block    = virtio_transport_notify_send_pre_block,
-		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
-		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
-
-		.set_buffer_size          = virtio_transport_set_buffer_size,
-		.set_min_buffer_size      = virtio_transport_set_min_buffer_size,
-		.set_max_buffer_size      = virtio_transport_set_max_buffer_size,
-		.get_buffer_size          = virtio_transport_get_buffer_size,
-		.get_min_buffer_size      = virtio_transport_get_min_buffer_size,
-		.get_max_buffer_size      = virtio_transport_get_max_buffer_size,
-	},
-
-	.send_pkt = vhost_transport_send_pkt,
-};
-
 static int __init vhost_vsock_init(void)
 {
 	int ret;
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 07875ccc7bb5..b139f76060a6 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -150,7 +150,8 @@ virtio_transport_dgram_enqueue(struct vsock_sock *vsk,
 
 void virtio_transport_destruct(struct vsock_sock *vsk);
 
-void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt);
+void virtio_transport_recv_pkt(struct virtio_transport *t,
+			       struct virtio_vsock_pkt *pkt);
 void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt);
 void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt);
 u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 082a30936690..3756f0857946 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -86,33 +86,6 @@ out_rcu:
 	return ret;
 }
 
-static void virtio_transport_loopback_work(struct work_struct *work)
-{
-	struct virtio_vsock *vsock =
-		container_of(work, struct virtio_vsock, loopback_work);
-	LIST_HEAD(pkts);
-
-	spin_lock_bh(&vsock->loopback_list_lock);
-	list_splice_init(&vsock->loopback_list, &pkts);
-	spin_unlock_bh(&vsock->loopback_list_lock);
-
-	mutex_lock(&vsock->rx_lock);
-
-	if (!vsock->rx_run)
-		goto out;
-
-	while (!list_empty(&pkts)) {
-		struct virtio_vsock_pkt *pkt;
-
-		pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list);
-		list_del_init(&pkt->list);
-
-		virtio_transport_recv_pkt(pkt);
-	}
-out:
-	mutex_unlock(&vsock->rx_lock);
-}
-
 static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
 					      struct virtio_vsock_pkt *pkt)
 {
@@ -370,59 +343,6 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
 	return val < virtqueue_get_vring_size(vq);
 }
 
-static void virtio_transport_rx_work(struct work_struct *work)
-{
-	struct virtio_vsock *vsock =
-		container_of(work, struct virtio_vsock, rx_work);
-	struct virtqueue *vq;
-
-	vq = vsock->vqs[VSOCK_VQ_RX];
-
-	mutex_lock(&vsock->rx_lock);
-
-	if (!vsock->rx_run)
-		goto out;
-
-	do {
-		virtqueue_disable_cb(vq);
-		for (;;) {
-			struct virtio_vsock_pkt *pkt;
-			unsigned int len;
-
-			if (!virtio_transport_more_replies(vsock)) {
-				/* Stop rx until the device processes already
-				 * pending replies.  Leave rx virtqueue
-				 * callbacks disabled.
-				 */
-				goto out;
-			}
-
-			pkt = virtqueue_get_buf(vq, &len);
-			if (!pkt) {
-				break;
-			}
-
-			vsock->rx_buf_nr--;
-
-			/* Drop short/long packets */
-			if (unlikely(len < sizeof(pkt->hdr) ||
-				     len > sizeof(pkt->hdr) + pkt->len)) {
-				virtio_transport_free_pkt(pkt);
-				continue;
-			}
-
-			pkt->len = len - sizeof(pkt->hdr);
-			virtio_transport_deliver_tap_pkt(pkt);
-			virtio_transport_recv_pkt(pkt);
-		}
-	} while (!virtqueue_enable_cb(vq));
-
-out:
-	if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2)
-		virtio_vsock_rx_fill(vsock);
-	mutex_unlock(&vsock->rx_lock);
-}
-
 /* event_lock must be held */
 static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock,
 				       struct virtio_vsock_event *event)
@@ -586,6 +506,86 @@ static struct virtio_transport virtio_transport = {
 	.send_pkt = virtio_transport_send_pkt,
 };
 
+static void virtio_transport_loopback_work(struct work_struct *work)
+{
+	struct virtio_vsock *vsock =
+		container_of(work, struct virtio_vsock, loopback_work);
+	LIST_HEAD(pkts);
+
+	spin_lock_bh(&vsock->loopback_list_lock);
+	list_splice_init(&vsock->loopback_list, &pkts);
+	spin_unlock_bh(&vsock->loopback_list_lock);
+
+	mutex_lock(&vsock->rx_lock);
+
+	if (!vsock->rx_run)
+		goto out;
+
+	while (!list_empty(&pkts)) {
+		struct virtio_vsock_pkt *pkt;
+
+		pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list);
+		list_del_init(&pkt->list);
+
+		virtio_transport_recv_pkt(&virtio_transport, pkt);
+	}
+out:
+	mutex_unlock(&vsock->rx_lock);
+}
+
+static void virtio_transport_rx_work(struct work_struct *work)
+{
+	struct virtio_vsock *vsock =
+		container_of(work, struct virtio_vsock, rx_work);
+	struct virtqueue *vq;
+
+	vq = vsock->vqs[VSOCK_VQ_RX];
+
+	mutex_lock(&vsock->rx_lock);
+
+	if (!vsock->rx_run)
+		goto out;
+
+	do {
+		virtqueue_disable_cb(vq);
+		for (;;) {
+			struct virtio_vsock_pkt *pkt;
+			unsigned int len;
+
+			if (!virtio_transport_more_replies(vsock)) {
+				/* Stop rx until the device processes already
+				 * pending replies.  Leave rx virtqueue
+				 * callbacks disabled.
+				 */
+				goto out;
+			}
+
+			pkt = virtqueue_get_buf(vq, &len);
+			if (!pkt) {
+				break;
+			}
+
+			vsock->rx_buf_nr--;
+
+			/* Drop short/long packets */
+			if (unlikely(len < sizeof(pkt->hdr) ||
+				     len > sizeof(pkt->hdr) + pkt->len)) {
+				virtio_transport_free_pkt(pkt);
+				continue;
+			}
+
+			pkt->len = len - sizeof(pkt->hdr);
+			virtio_transport_deliver_tap_pkt(pkt);
+			virtio_transport_recv_pkt(&virtio_transport, pkt);
+		}
+	} while (!virtqueue_enable_cb(vq));
+
+out:
+	if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2)
+		virtio_vsock_rx_fill(vsock);
+	mutex_unlock(&vsock->rx_lock);
+}
+
 static int virtio_vsock_probe(struct virtio_device *vdev)
 {
 	vq_callback_t *callbacks[] = {
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 3edc373d2acc..e7b5e99842c9 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -745,9 +745,9 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
 /* Normally packets are associated with a socket.  There may be no socket if an
  * attempt was made to connect to a socket that does not exist.
  */
-static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
+static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
+					  struct virtio_vsock_pkt *pkt)
 {
-	const struct virtio_transport *t;
 	struct virtio_vsock_pkt *reply;
 	struct virtio_vsock_pkt_info info = {
 		.op = VIRTIO_VSOCK_OP_RST,
@@ -767,7 +767,6 @@ static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
 	if (!reply)
 		return -ENOMEM;
 
-	t = virtio_transport_get_ops();
 	if (!t) {
 		virtio_transport_free_pkt(reply);
 		return -ENOTCONN;
@@ -1109,7 +1108,8 @@ static bool virtio_transport_space_update(struct sock *sk,
 /* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex
  * lock.
  */
-void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
+void virtio_transport_recv_pkt(struct virtio_transport *t,
+			       struct virtio_vsock_pkt *pkt)
 {
 	struct sockaddr_vm src, dst;
 	struct vsock_sock *vsk;
@@ -1131,7 +1131,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
 					le32_to_cpu(pkt->hdr.fwd_cnt));
 
 	if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) {
-		(void)virtio_transport_reset_no_sock(pkt);
+		(void)virtio_transport_reset_no_sock(t, pkt);
 		goto free_pkt;
 	}
 
@@ -1142,7 +1142,7 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
 	if (!sk) {
 		sk = vsock_find_bound_socket(&dst);
 		if (!sk) {
-			(void)virtio_transport_reset_no_sock(pkt);
+			(void)virtio_transport_reset_no_sock(t, pkt);
 			goto free_pkt;
 		}
 	}
-- 
cgit v1.2.3-59-g8ed1b


From daabfbca34ecfa936d3bf5219167c4c5e67db150 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Thu, 14 Nov 2019 10:57:41 +0100
Subject: vsock: add 'struct vsock_sock *' param to vsock_core_get_transport()

Since now the 'struct vsock_sock' object contains a pointer to
the transport, this patch adds a parameter to the
vsock_core_get_transport() to return the right transport
assigned to the socket.

This patch modifies also the virtio_transport_get_ops(), that
uses the vsock_core_get_transport(), adding the
'struct vsock_sock *' parameter.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_vsock.h                  | 2 +-
 net/vmw_vsock/af_vsock.c                | 7 ++-----
 net/vmw_vsock/virtio_transport_common.c | 9 +++++----
 3 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index a5e1e134261d..2ca67d048de4 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -166,7 +166,7 @@ static inline int vsock_core_init(const struct vsock_transport *t)
 void vsock_core_exit(void);
 
 /* The transport may downcast this to access transport-specific functions */
-const struct vsock_transport *vsock_core_get_transport(void);
+const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk);
 
 /**** UTILS ****/
 
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index d813967d7dd5..f057acb0ee29 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1999,12 +1999,9 @@ void vsock_core_exit(void)
 }
 EXPORT_SYMBOL_GPL(vsock_core_exit);
 
-const struct vsock_transport *vsock_core_get_transport(void)
+const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk)
 {
-	/* vsock_register_mutex not taken since only the transport uses this
-	 * function and only while registered.
-	 */
-	return transport_single;
+	return vsk->transport;
 }
 EXPORT_SYMBOL_GPL(vsock_core_get_transport);
 
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index e7b5e99842c9..b113619d9576 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -29,9 +29,10 @@
 /* Threshold for detecting small packets to copy */
 #define GOOD_COPY_LEN  128
 
-static const struct virtio_transport *virtio_transport_get_ops(void)
+static const struct virtio_transport *
+virtio_transport_get_ops(struct vsock_sock *vsk)
 {
-	const struct vsock_transport *t = vsock_core_get_transport();
+	const struct vsock_transport *t = vsock_core_get_transport(vsk);
 
 	return container_of(t, struct virtio_transport, transport);
 }
@@ -168,7 +169,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 	struct virtio_vsock_pkt *pkt;
 	u32 pkt_len = info->pkt_len;
 
-	src_cid = virtio_transport_get_ops()->transport.get_local_cid();
+	src_cid = virtio_transport_get_ops(vsk)->transport.get_local_cid();
 	src_port = vsk->local_addr.svm_port;
 	if (!info->remote_cid) {
 		dst_cid	= vsk->remote_addr.svm_cid;
@@ -201,7 +202,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 
 	virtio_transport_inc_tx_pkt(vvs, pkt);
 
-	return virtio_transport_get_ops()->send_pkt(pkt);
+	return virtio_transport_get_ops(vsk)->send_pkt(pkt);
 }
 
 static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
-- 
cgit v1.2.3-59-g8ed1b


From b9f2b0ffde0c9b666b2b1672eb468b8f805a9b97 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Thu, 14 Nov 2019 10:57:42 +0100
Subject: vsock: handle buffer_size sockopts in the core

virtio_transport and vmci_transport handle the buffer_size
sockopts in a very similar way.

In order to support multiple transports, this patch moves this
handling in the core to allow the user to change the options
also if the socket is not yet assigned to any transport.

This patch also adds the '.notify_buffer_size' callback in the
'struct virtio_transport' in order to inform the transport,
when the buffer_size is changed by the user. It is also useful
to limit the 'buffer_size' requested (e.g. virtio transports).

Acked-by: Dexuan Cui <decui@microsoft.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/vhost/vsock.c                   |  7 +--
 include/linux/virtio_vsock.h            | 15 +-----
 include/net/af_vsock.h                  | 15 +++---
 net/vmw_vsock/af_vsock.c                | 43 ++++++++++++++---
 net/vmw_vsock/hyperv_transport.c        | 36 --------------
 net/vmw_vsock/virtio_transport.c        |  8 +--
 net/vmw_vsock/virtio_transport_common.c | 79 +++++-------------------------
 net/vmw_vsock/vmci_transport.c          | 86 ++++-----------------------------
 net/vmw_vsock/vmci_transport.h          |  3 --
 9 files changed, 65 insertions(+), 227 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 92ab3852c954..6d7e4f022748 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -418,13 +418,8 @@ static struct virtio_transport vhost_transport = {
 		.notify_send_pre_block    = virtio_transport_notify_send_pre_block,
 		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
+		.notify_buffer_size       = virtio_transport_notify_buffer_size,
 
-		.set_buffer_size          = virtio_transport_set_buffer_size,
-		.set_min_buffer_size      = virtio_transport_set_min_buffer_size,
-		.set_max_buffer_size      = virtio_transport_set_max_buffer_size,
-		.get_buffer_size          = virtio_transport_get_buffer_size,
-		.get_min_buffer_size      = virtio_transport_get_min_buffer_size,
-		.get_max_buffer_size      = virtio_transport_get_max_buffer_size,
 	},
 
 	.send_pkt = vhost_transport_send_pkt,
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index b139f76060a6..71c81e0dc8f2 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -7,9 +7,6 @@
 #include <net/sock.h>
 #include <net/af_vsock.h>
 
-#define VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE	128
-#define VIRTIO_VSOCK_DEFAULT_BUF_SIZE		(1024 * 256)
-#define VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE	(1024 * 256)
 #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
 #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
 #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
@@ -25,11 +22,6 @@ enum {
 struct virtio_vsock_sock {
 	struct vsock_sock *vsk;
 
-	/* Protected by lock_sock(sk_vsock(trans->vsk)) */
-	u32 buf_size;
-	u32 buf_size_min;
-	u32 buf_size_max;
-
 	spinlock_t tx_lock;
 	spinlock_t rx_lock;
 
@@ -92,12 +84,6 @@ s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);
 
 int virtio_transport_do_socket_init(struct vsock_sock *vsk,
 				 struct vsock_sock *psk);
-u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk);
-u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk);
-u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk);
-void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val);
-void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val);
-void virtio_transport_set_max_buffer_size(struct vsock_sock *vs, u64 val);
 int
 virtio_transport_notify_poll_in(struct vsock_sock *vsk,
 				size_t target,
@@ -124,6 +110,7 @@ int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk,
 	struct vsock_transport_send_notify_data *data);
 int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk,
 	ssize_t written, struct vsock_transport_send_notify_data *data);
+void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val);
 
 u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk);
 bool virtio_transport_stream_is_active(struct vsock_sock *vsk);
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 2ca67d048de4..4b5d16840fd4 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -65,6 +65,11 @@ struct vsock_sock {
 	bool sent_request;
 	bool ignore_connecting_rst;
 
+	/* Protected by lock_sock(sk) */
+	u64 buffer_size;
+	u64 buffer_min_size;
+	u64 buffer_max_size;
+
 	/* Private to transport. */
 	void *trans;
 };
@@ -140,18 +145,12 @@ struct vsock_transport {
 		struct vsock_transport_send_notify_data *);
 	int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t,
 		struct vsock_transport_send_notify_data *);
+	/* sk_lock held by the caller */
+	void (*notify_buffer_size)(struct vsock_sock *, u64 *);
 
 	/* Shutdown. */
 	int (*shutdown)(struct vsock_sock *, int);
 
-	/* Buffer sizes. */
-	void (*set_buffer_size)(struct vsock_sock *, u64);
-	void (*set_min_buffer_size)(struct vsock_sock *, u64);
-	void (*set_max_buffer_size)(struct vsock_sock *, u64);
-	u64 (*get_buffer_size)(struct vsock_sock *);
-	u64 (*get_min_buffer_size)(struct vsock_sock *);
-	u64 (*get_max_buffer_size)(struct vsock_sock *);
-
 	/* Addressing. */
 	u32 (*get_local_cid)(void);
 };
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index f057acb0ee29..11b88094e3b2 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -126,6 +126,10 @@ static struct proto vsock_proto = {
  */
 #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ)
 
+#define VSOCK_DEFAULT_BUFFER_SIZE     (1024 * 256)
+#define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256)
+#define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128
+
 static const struct vsock_transport *transport_single;
 static DEFINE_MUTEX(vsock_register_mutex);
 
@@ -613,10 +617,16 @@ struct sock *__vsock_create(struct net *net,
 		vsk->trusted = psk->trusted;
 		vsk->owner = get_cred(psk->owner);
 		vsk->connect_timeout = psk->connect_timeout;
+		vsk->buffer_size = psk->buffer_size;
+		vsk->buffer_min_size = psk->buffer_min_size;
+		vsk->buffer_max_size = psk->buffer_max_size;
 	} else {
 		vsk->trusted = capable(CAP_NET_ADMIN);
 		vsk->owner = get_current_cred();
 		vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT;
+		vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE;
+		vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE;
+		vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE;
 	}
 
 	if (vsk->transport->init(vsk, psk) < 0) {
@@ -1366,6 +1376,23 @@ out:
 	return err;
 }
 
+static void vsock_update_buffer_size(struct vsock_sock *vsk,
+				     const struct vsock_transport *transport,
+				     u64 val)
+{
+	if (val > vsk->buffer_max_size)
+		val = vsk->buffer_max_size;
+
+	if (val < vsk->buffer_min_size)
+		val = vsk->buffer_min_size;
+
+	if (val != vsk->buffer_size &&
+	    transport && transport->notify_buffer_size)
+		transport->notify_buffer_size(vsk, &val);
+
+	vsk->buffer_size = val;
+}
+
 static int vsock_stream_setsockopt(struct socket *sock,
 				   int level,
 				   int optname,
@@ -1403,17 +1430,19 @@ static int vsock_stream_setsockopt(struct socket *sock,
 	switch (optname) {
 	case SO_VM_SOCKETS_BUFFER_SIZE:
 		COPY_IN(val);
-		transport->set_buffer_size(vsk, val);
+		vsock_update_buffer_size(vsk, transport, val);
 		break;
 
 	case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
 		COPY_IN(val);
-		transport->set_max_buffer_size(vsk, val);
+		vsk->buffer_max_size = val;
+		vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
 		break;
 
 	case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
 		COPY_IN(val);
-		transport->set_min_buffer_size(vsk, val);
+		vsk->buffer_min_size = val;
+		vsock_update_buffer_size(vsk, transport, vsk->buffer_size);
 		break;
 
 	case SO_VM_SOCKETS_CONNECT_TIMEOUT: {
@@ -1454,7 +1483,6 @@ static int vsock_stream_getsockopt(struct socket *sock,
 	int len;
 	struct sock *sk;
 	struct vsock_sock *vsk;
-	const struct vsock_transport *transport;
 	u64 val;
 
 	if (level != AF_VSOCK)
@@ -1478,21 +1506,20 @@ static int vsock_stream_getsockopt(struct socket *sock,
 	err = 0;
 	sk = sock->sk;
 	vsk = vsock_sk(sk);
-	transport = vsk->transport;
 
 	switch (optname) {
 	case SO_VM_SOCKETS_BUFFER_SIZE:
-		val = transport->get_buffer_size(vsk);
+		val = vsk->buffer_size;
 		COPY_OUT(val);
 		break;
 
 	case SO_VM_SOCKETS_BUFFER_MAX_SIZE:
-		val = transport->get_max_buffer_size(vsk);
+		val = vsk->buffer_max_size;
 		COPY_OUT(val);
 		break;
 
 	case SO_VM_SOCKETS_BUFFER_MIN_SIZE:
-		val = transport->get_min_buffer_size(vsk);
+		val = vsk->buffer_min_size;
 		COPY_OUT(val);
 		break;
 
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 7fa09c5e4625..ab947561543e 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -845,36 +845,6 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written,
 	return 0;
 }
 
-static void hvs_set_buffer_size(struct vsock_sock *vsk, u64 val)
-{
-	/* Ignored. */
-}
-
-static void hvs_set_min_buffer_size(struct vsock_sock *vsk, u64 val)
-{
-	/* Ignored. */
-}
-
-static void hvs_set_max_buffer_size(struct vsock_sock *vsk, u64 val)
-{
-	/* Ignored. */
-}
-
-static u64 hvs_get_buffer_size(struct vsock_sock *vsk)
-{
-	return -ENOPROTOOPT;
-}
-
-static u64 hvs_get_min_buffer_size(struct vsock_sock *vsk)
-{
-	return -ENOPROTOOPT;
-}
-
-static u64 hvs_get_max_buffer_size(struct vsock_sock *vsk)
-{
-	return -ENOPROTOOPT;
-}
-
 static struct vsock_transport hvs_transport = {
 	.get_local_cid            = hvs_get_local_cid,
 
@@ -908,12 +878,6 @@ static struct vsock_transport hvs_transport = {
 	.notify_send_pre_enqueue  = hvs_notify_send_pre_enqueue,
 	.notify_send_post_enqueue = hvs_notify_send_post_enqueue,
 
-	.set_buffer_size          = hvs_set_buffer_size,
-	.set_min_buffer_size      = hvs_set_min_buffer_size,
-	.set_max_buffer_size      = hvs_set_max_buffer_size,
-	.get_buffer_size          = hvs_get_buffer_size,
-	.get_min_buffer_size      = hvs_get_min_buffer_size,
-	.get_max_buffer_size      = hvs_get_max_buffer_size,
 };
 
 static int hvs_probe(struct hv_device *hdev,
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 3756f0857946..fb1fc7760e8c 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -494,13 +494,7 @@ static struct virtio_transport virtio_transport = {
 		.notify_send_pre_block    = virtio_transport_notify_send_pre_block,
 		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
-
-		.set_buffer_size          = virtio_transport_set_buffer_size,
-		.set_min_buffer_size      = virtio_transport_set_min_buffer_size,
-		.set_max_buffer_size      = virtio_transport_set_max_buffer_size,
-		.get_buffer_size          = virtio_transport_get_buffer_size,
-		.get_min_buffer_size      = virtio_transport_get_min_buffer_size,
-		.get_max_buffer_size      = virtio_transport_get_max_buffer_size,
+		.notify_buffer_size       = virtio_transport_notify_buffer_size,
 	},
 
 	.send_pkt = virtio_transport_send_pkt,
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index b113619d9576..d4a0bf19aa98 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -456,17 +456,13 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk,
 	if (psk) {
 		struct virtio_vsock_sock *ptrans = psk->trans;
 
-		vvs->buf_size	= ptrans->buf_size;
-		vvs->buf_size_min = ptrans->buf_size_min;
-		vvs->buf_size_max = ptrans->buf_size_max;
 		vvs->peer_buf_alloc = ptrans->peer_buf_alloc;
-	} else {
-		vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE;
-		vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE;
-		vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE;
 	}
 
-	vvs->buf_alloc = vvs->buf_size;
+	if (vsk->buffer_size > VIRTIO_VSOCK_MAX_BUF_SIZE)
+		vsk->buffer_size = VIRTIO_VSOCK_MAX_BUF_SIZE;
+
+	vvs->buf_alloc = vsk->buffer_size;
 
 	spin_lock_init(&vvs->rx_lock);
 	spin_lock_init(&vvs->tx_lock);
@@ -476,71 +472,20 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk,
 }
 EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init);
 
-u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk)
-{
-	struct virtio_vsock_sock *vvs = vsk->trans;
-
-	return vvs->buf_size;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size);
-
-u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk)
+/* sk_lock held by the caller */
+void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val)
 {
 	struct virtio_vsock_sock *vvs = vsk->trans;
 
-	return vvs->buf_size_min;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size);
-
-u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk)
-{
-	struct virtio_vsock_sock *vvs = vsk->trans;
-
-	return vvs->buf_size_max;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size);
-
-void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
-{
-	struct virtio_vsock_sock *vvs = vsk->trans;
+	if (*val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+		*val = VIRTIO_VSOCK_MAX_BUF_SIZE;
 
-	if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
-		val = VIRTIO_VSOCK_MAX_BUF_SIZE;
-	if (val < vvs->buf_size_min)
-		vvs->buf_size_min = val;
-	if (val > vvs->buf_size_max)
-		vvs->buf_size_max = val;
-	vvs->buf_size = val;
-	vvs->buf_alloc = val;
+	vvs->buf_alloc = *val;
 
 	virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM,
 					    NULL);
 }
-EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size);
-
-void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val)
-{
-	struct virtio_vsock_sock *vvs = vsk->trans;
-
-	if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
-		val = VIRTIO_VSOCK_MAX_BUF_SIZE;
-	if (val > vvs->buf_size)
-		vvs->buf_size = val;
-	vvs->buf_size_min = val;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size);
-
-void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val)
-{
-	struct virtio_vsock_sock *vvs = vsk->trans;
-
-	if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
-		val = VIRTIO_VSOCK_MAX_BUF_SIZE;
-	if (val < vvs->buf_size)
-		vvs->buf_size = val;
-	vvs->buf_size_max = val;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size);
+EXPORT_SYMBOL_GPL(virtio_transport_notify_buffer_size);
 
 int
 virtio_transport_notify_poll_in(struct vsock_sock *vsk,
@@ -632,9 +577,7 @@ EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue);
 
 u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk)
 {
-	struct virtio_vsock_sock *vvs = vsk->trans;
-
-	return vvs->buf_size;
+	return vsk->buffer_size;
 }
 EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat);
 
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index cf3b78f0038f..608bb6bd79aa 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -74,10 +74,6 @@ static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
 
 static int PROTOCOL_OVERRIDE = -1;
 
-#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN   128
-#define VMCI_TRANSPORT_DEFAULT_QP_SIZE       262144
-#define VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX   262144
-
 /* Helper function to convert from a VMCI error code to a VSock error code. */
 
 static s32 vmci_transport_error_to_vsock_error(s32 vmci_error)
@@ -1025,11 +1021,11 @@ static int vmci_transport_recv_listen(struct sock *sk,
 	/* If the proposed size fits within our min/max, accept it. Otherwise
 	 * propose our own size.
 	 */
-	if (pkt->u.size >= vmci_trans(vpending)->queue_pair_min_size &&
-	    pkt->u.size <= vmci_trans(vpending)->queue_pair_max_size) {
+	if (pkt->u.size >= vpending->buffer_min_size &&
+	    pkt->u.size <= vpending->buffer_max_size) {
 		qp_size = pkt->u.size;
 	} else {
-		qp_size = vmci_trans(vpending)->queue_pair_size;
+		qp_size = vpending->buffer_size;
 	}
 
 	/* Figure out if we are using old or new requests based on the
@@ -1098,7 +1094,7 @@ static int vmci_transport_recv_listen(struct sock *sk,
 	pending->sk_state = TCP_SYN_SENT;
 	vmci_trans(vpending)->produce_size =
 		vmci_trans(vpending)->consume_size = qp_size;
-	vmci_trans(vpending)->queue_pair_size = qp_size;
+	vpending->buffer_size = qp_size;
 
 	vmci_trans(vpending)->notify_ops->process_request(pending);
 
@@ -1392,8 +1388,8 @@ static int vmci_transport_recv_connecting_client_negotiate(
 	vsk->ignore_connecting_rst = false;
 
 	/* Verify that we're OK with the proposed queue pair size */
-	if (pkt->u.size < vmci_trans(vsk)->queue_pair_min_size ||
-	    pkt->u.size > vmci_trans(vsk)->queue_pair_max_size) {
+	if (pkt->u.size < vsk->buffer_min_size ||
+	    pkt->u.size > vsk->buffer_max_size) {
 		err = -EINVAL;
 		goto destroy;
 	}
@@ -1498,8 +1494,7 @@ vmci_transport_recv_connecting_client_invalid(struct sock *sk,
 		vsk->sent_request = false;
 		vsk->ignore_connecting_rst = true;
 
-		err = vmci_transport_send_conn_request(
-			sk, vmci_trans(vsk)->queue_pair_size);
+		err = vmci_transport_send_conn_request(sk, vsk->buffer_size);
 		if (err < 0)
 			err = vmci_transport_error_to_vsock_error(err);
 		else
@@ -1583,21 +1578,6 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk,
 	INIT_LIST_HEAD(&vmci_trans(vsk)->elem);
 	vmci_trans(vsk)->sk = &vsk->sk;
 	spin_lock_init(&vmci_trans(vsk)->lock);
-	if (psk) {
-		vmci_trans(vsk)->queue_pair_size =
-			vmci_trans(psk)->queue_pair_size;
-		vmci_trans(vsk)->queue_pair_min_size =
-			vmci_trans(psk)->queue_pair_min_size;
-		vmci_trans(vsk)->queue_pair_max_size =
-			vmci_trans(psk)->queue_pair_max_size;
-	} else {
-		vmci_trans(vsk)->queue_pair_size =
-			VMCI_TRANSPORT_DEFAULT_QP_SIZE;
-		vmci_trans(vsk)->queue_pair_min_size =
-			 VMCI_TRANSPORT_DEFAULT_QP_SIZE_MIN;
-		vmci_trans(vsk)->queue_pair_max_size =
-			VMCI_TRANSPORT_DEFAULT_QP_SIZE_MAX;
-	}
 
 	return 0;
 }
@@ -1813,8 +1793,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk)
 
 	if (vmci_transport_old_proto_override(&old_pkt_proto) &&
 		old_pkt_proto) {
-		err = vmci_transport_send_conn_request(
-			sk, vmci_trans(vsk)->queue_pair_size);
+		err = vmci_transport_send_conn_request(sk, vsk->buffer_size);
 		if (err < 0) {
 			sk->sk_state = TCP_CLOSE;
 			return err;
@@ -1822,8 +1801,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk)
 	} else {
 		int supported_proto_versions =
 			vmci_transport_new_proto_supported_versions();
-		err = vmci_transport_send_conn_request2(
-				sk, vmci_trans(vsk)->queue_pair_size,
+		err = vmci_transport_send_conn_request2(sk, vsk->buffer_size,
 				supported_proto_versions);
 		if (err < 0) {
 			sk->sk_state = TCP_CLOSE;
@@ -1876,46 +1854,6 @@ static bool vmci_transport_stream_is_active(struct vsock_sock *vsk)
 	return !vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle);
 }
 
-static u64 vmci_transport_get_buffer_size(struct vsock_sock *vsk)
-{
-	return vmci_trans(vsk)->queue_pair_size;
-}
-
-static u64 vmci_transport_get_min_buffer_size(struct vsock_sock *vsk)
-{
-	return vmci_trans(vsk)->queue_pair_min_size;
-}
-
-static u64 vmci_transport_get_max_buffer_size(struct vsock_sock *vsk)
-{
-	return vmci_trans(vsk)->queue_pair_max_size;
-}
-
-static void vmci_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
-{
-	if (val < vmci_trans(vsk)->queue_pair_min_size)
-		vmci_trans(vsk)->queue_pair_min_size = val;
-	if (val > vmci_trans(vsk)->queue_pair_max_size)
-		vmci_trans(vsk)->queue_pair_max_size = val;
-	vmci_trans(vsk)->queue_pair_size = val;
-}
-
-static void vmci_transport_set_min_buffer_size(struct vsock_sock *vsk,
-					       u64 val)
-{
-	if (val > vmci_trans(vsk)->queue_pair_size)
-		vmci_trans(vsk)->queue_pair_size = val;
-	vmci_trans(vsk)->queue_pair_min_size = val;
-}
-
-static void vmci_transport_set_max_buffer_size(struct vsock_sock *vsk,
-					       u64 val)
-{
-	if (val < vmci_trans(vsk)->queue_pair_size)
-		vmci_trans(vsk)->queue_pair_size = val;
-	vmci_trans(vsk)->queue_pair_max_size = val;
-}
-
 static int vmci_transport_notify_poll_in(
 	struct vsock_sock *vsk,
 	size_t target,
@@ -2098,12 +2036,6 @@ static const struct vsock_transport vmci_transport = {
 	.notify_send_pre_enqueue = vmci_transport_notify_send_pre_enqueue,
 	.notify_send_post_enqueue = vmci_transport_notify_send_post_enqueue,
 	.shutdown = vmci_transport_shutdown,
-	.set_buffer_size = vmci_transport_set_buffer_size,
-	.set_min_buffer_size = vmci_transport_set_min_buffer_size,
-	.set_max_buffer_size = vmci_transport_set_max_buffer_size,
-	.get_buffer_size = vmci_transport_get_buffer_size,
-	.get_min_buffer_size = vmci_transport_get_min_buffer_size,
-	.get_max_buffer_size = vmci_transport_get_max_buffer_size,
 	.get_local_cid = vmci_transport_get_local_cid,
 };
 
diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h
index 1ca1e8640b31..b7b072194282 100644
--- a/net/vmw_vsock/vmci_transport.h
+++ b/net/vmw_vsock/vmci_transport.h
@@ -108,9 +108,6 @@ struct vmci_transport {
 	struct vmci_qp *qpair;
 	u64 produce_size;
 	u64 consume_size;
-	u64 queue_pair_size;
-	u64 queue_pair_min_size;
-	u64 queue_pair_max_size;
 	u32 detach_sub_id;
 	union vmci_transport_notify notify;
 	const struct vmci_transport_notify_ops *notify_ops;
-- 
cgit v1.2.3-59-g8ed1b


From b9ca2f5ff7784d46285a8f1b14419ac4645096f7 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Thu, 14 Nov 2019 10:57:43 +0100
Subject: vsock: add vsock_create_connected() called by transports

All transports call __vsock_create() with the same parameters,
most of them depending on the parent socket. In order to simplify
the VSOCK core APIs exposed to the transports, this patch adds
the vsock_create_connected() callable from transports to create
a new socket when a connection request is received.
We also unexported the __vsock_create().

Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_vsock.h                  |  5 +----
 net/vmw_vsock/af_vsock.c                | 20 +++++++++++++-------
 net/vmw_vsock/hyperv_transport.c        |  3 +--
 net/vmw_vsock/virtio_transport_common.c |  3 +--
 net/vmw_vsock/vmci_transport.c          |  3 +--
 5 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 4b5d16840fd4..fa1570dc9f5c 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -76,10 +76,7 @@ struct vsock_sock {
 
 s64 vsock_stream_has_data(struct vsock_sock *vsk);
 s64 vsock_stream_has_space(struct vsock_sock *vsk);
-struct sock *__vsock_create(struct net *net,
-			    struct socket *sock,
-			    struct sock *parent,
-			    gfp_t priority, unsigned short type, int kern);
+struct sock *vsock_create_connected(struct sock *parent);
 
 /**** TRANSPORT ****/
 
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 11b88094e3b2..7c11ac1bc542 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -567,12 +567,12 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
 
 static void vsock_connect_timeout(struct work_struct *work);
 
-struct sock *__vsock_create(struct net *net,
-			    struct socket *sock,
-			    struct sock *parent,
-			    gfp_t priority,
-			    unsigned short type,
-			    int kern)
+static struct sock *__vsock_create(struct net *net,
+				   struct socket *sock,
+				   struct sock *parent,
+				   gfp_t priority,
+				   unsigned short type,
+				   int kern)
 {
 	struct sock *sk;
 	struct vsock_sock *psk;
@@ -639,7 +639,6 @@ struct sock *__vsock_create(struct net *net,
 
 	return sk;
 }
-EXPORT_SYMBOL_GPL(__vsock_create);
 
 static void __vsock_release(struct sock *sk, int level)
 {
@@ -703,6 +702,13 @@ static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return err;
 }
 
+struct sock *vsock_create_connected(struct sock *parent)
+{
+	return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL,
+			      parent->sk_type, 0);
+}
+EXPORT_SYMBOL_GPL(vsock_create_connected);
+
 s64 vsock_stream_has_data(struct vsock_sock *vsk)
 {
 	return vsk->transport->stream_has_data(vsk);
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index ab947561543e..7d0a972a1428 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -360,8 +360,7 @@ static void hvs_open_connection(struct vmbus_channel *chan)
 		if (sk->sk_ack_backlog >= sk->sk_max_ack_backlog)
 			goto out;
 
-		new = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
-				     sk->sk_type, 0);
+		new = vsock_create_connected(sk);
 		if (!new)
 			goto out;
 
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index d4a0bf19aa98..b7b1a98e478e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1004,8 +1004,7 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
 		return -ENOMEM;
 	}
 
-	child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
-			       sk->sk_type, 0);
+	child = vsock_create_connected(sk);
 	if (!child) {
 		virtio_transport_reset(vsk, pkt);
 		return -ENOMEM;
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 608bb6bd79aa..b6c8c9cc8d72 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1004,8 +1004,7 @@ static int vmci_transport_recv_listen(struct sock *sk,
 		return -ECONNREFUSED;
 	}
 
-	pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
-				 sk->sk_type, 0);
+	pending = vsock_create_connected(sk);
 	if (!pending) {
 		vmci_transport_send_reset(sk, pkt);
 		return -ENOMEM;
-- 
cgit v1.2.3-59-g8ed1b


From c0cfa2d8a788fcf45df5bf4070ab2474c88d543a Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Thu, 14 Nov 2019 10:57:46 +0100
Subject: vsock: add multi-transports support

This patch adds the support of multiple transports in the
VSOCK core.

With the multi-transports support, we can use vsock with nested VMs
(using also different hypervisors) loading both guest->host and
host->guest transports at the same time.

Major changes:
- vsock core module can be loaded regardless of the transports
- vsock_core_init() and vsock_core_exit() are renamed to
  vsock_core_register() and vsock_core_unregister()
- vsock_core_register() has a feature parameter (H2G, G2H, DGRAM)
  to identify which directions the transport can handle and if it's
  support DGRAM (only vmci)
- each stream socket is assigned to a transport when the remote CID
  is set (during the connect() or when we receive a connection request
  on a listener socket).
  The remote CID is used to decide which transport to use:
  - remote CID <= VMADDR_CID_HOST will use guest->host transport;
  - remote CID == local_cid (guest->host transport) will use guest->host
    transport for loopback (host->guest transports don't support loopback);
  - remote CID > VMADDR_CID_HOST will use host->guest transport;
- listener sockets are not bound to any transports since no transport
  operations are done on it. In this way we can create a listener
  socket, also if the transports are not loaded or with VMADDR_CID_ANY
  to listen on all transports.
- DGRAM sockets are handled as before, since only the vmci_transport
  provides this feature.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/vhost/vsock.c                   |   5 +-
 include/net/af_vsock.h                  |  18 ++-
 net/vmw_vsock/af_vsock.c                | 243 ++++++++++++++++++++++++--------
 net/vmw_vsock/hyperv_transport.c        |  26 +++-
 net/vmw_vsock/virtio_transport.c        |   7 +-
 net/vmw_vsock/virtio_transport_common.c |  63 ++++++---
 net/vmw_vsock/vmci_transport.c          |  32 ++++-
 7 files changed, 297 insertions(+), 97 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 6d7e4f022748..b235f4bbe8ea 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -831,7 +831,8 @@ static int __init vhost_vsock_init(void)
 {
 	int ret;
 
-	ret = vsock_core_init(&vhost_transport.transport);
+	ret = vsock_core_register(&vhost_transport.transport,
+				  VSOCK_TRANSPORT_F_H2G);
 	if (ret < 0)
 		return ret;
 	return misc_register(&vhost_vsock_misc);
@@ -840,7 +841,7 @@ static int __init vhost_vsock_init(void)
 static void __exit vhost_vsock_exit(void)
 {
 	misc_deregister(&vhost_vsock_misc);
-	vsock_core_exit();
+	vsock_core_unregister(&vhost_transport.transport);
 };
 
 module_init(vhost_vsock_init);
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index fa1570dc9f5c..cf5c3691251b 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -91,6 +91,14 @@ struct vsock_transport_send_notify_data {
 	u64 data2; /* Transport-defined. */
 };
 
+/* Transport features flags */
+/* Transport provides host->guest communication */
+#define VSOCK_TRANSPORT_F_H2G		0x00000001
+/* Transport provides guest->host communication */
+#define VSOCK_TRANSPORT_F_G2H		0x00000002
+/* Transport provides DGRAM communication */
+#define VSOCK_TRANSPORT_F_DGRAM		0x00000004
+
 struct vsock_transport {
 	/* Initialize/tear-down socket. */
 	int (*init)(struct vsock_sock *, struct vsock_sock *);
@@ -154,12 +162,8 @@ struct vsock_transport {
 
 /**** CORE ****/
 
-int __vsock_core_init(const struct vsock_transport *t, struct module *owner);
-static inline int vsock_core_init(const struct vsock_transport *t)
-{
-	return __vsock_core_init(t, THIS_MODULE);
-}
-void vsock_core_exit(void);
+int vsock_core_register(const struct vsock_transport *t, int features);
+void vsock_core_unregister(const struct vsock_transport *t);
 
 /* The transport may downcast this to access transport-specific functions */
 const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk);
@@ -190,6 +194,8 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
 					 struct sockaddr_vm *dst);
 void vsock_remove_sock(struct vsock_sock *vsk);
 void vsock_for_each_connected_socket(void (*fn)(struct sock *sk));
+int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk);
+bool vsock_find_cid(unsigned int cid);
 
 /**** TAP ****/
 
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 8985d9d417f0..5357714b6104 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -130,7 +130,12 @@ static struct proto vsock_proto = {
 #define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256)
 #define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128
 
-static const struct vsock_transport *transport_single;
+/* Transport used for host->guest communication */
+static const struct vsock_transport *transport_h2g;
+/* Transport used for guest->host communication */
+static const struct vsock_transport *transport_g2h;
+/* Transport used for DGRAM communication */
+static const struct vsock_transport *transport_dgram;
 static DEFINE_MUTEX(vsock_register_mutex);
 
 /**** UTILS ****/
@@ -182,7 +187,7 @@ static int vsock_auto_bind(struct vsock_sock *vsk)
 	return __vsock_bind(sk, &local_addr);
 }
 
-static int __init vsock_init_tables(void)
+static void vsock_init_tables(void)
 {
 	int i;
 
@@ -191,7 +196,6 @@ static int __init vsock_init_tables(void)
 
 	for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++)
 		INIT_LIST_HEAD(&vsock_connected_table[i]);
-	return 0;
 }
 
 static void __vsock_insert_bound(struct list_head *list,
@@ -376,6 +380,68 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
 }
 EXPORT_SYMBOL_GPL(vsock_enqueue_accept);
 
+/* Assign a transport to a socket and call the .init transport callback.
+ *
+ * Note: for stream socket this must be called when vsk->remote_addr is set
+ * (e.g. during the connect() or when a connection request on a listener
+ * socket is received).
+ * The vsk->remote_addr is used to decide which transport to use:
+ *  - remote CID <= VMADDR_CID_HOST will use guest->host transport;
+ *  - remote CID == local_cid (guest->host transport) will use guest->host
+ *    transport for loopback (host->guest transports don't support loopback);
+ *  - remote CID > VMADDR_CID_HOST will use host->guest transport;
+ */
+int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
+{
+	const struct vsock_transport *new_transport;
+	struct sock *sk = sk_vsock(vsk);
+	unsigned int remote_cid = vsk->remote_addr.svm_cid;
+
+	switch (sk->sk_type) {
+	case SOCK_DGRAM:
+		new_transport = transport_dgram;
+		break;
+	case SOCK_STREAM:
+		if (remote_cid <= VMADDR_CID_HOST ||
+		    (transport_g2h &&
+		     remote_cid == transport_g2h->get_local_cid()))
+			new_transport = transport_g2h;
+		else
+			new_transport = transport_h2g;
+		break;
+	default:
+		return -ESOCKTNOSUPPORT;
+	}
+
+	if (vsk->transport) {
+		if (vsk->transport == new_transport)
+			return 0;
+
+		vsk->transport->release(vsk);
+		vsk->transport->destruct(vsk);
+	}
+
+	if (!new_transport)
+		return -ENODEV;
+
+	vsk->transport = new_transport;
+
+	return vsk->transport->init(vsk, psk);
+}
+EXPORT_SYMBOL_GPL(vsock_assign_transport);
+
+bool vsock_find_cid(unsigned int cid)
+{
+	if (transport_g2h && cid == transport_g2h->get_local_cid())
+		return true;
+
+	if (transport_h2g && cid == VMADDR_CID_HOST)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(vsock_find_cid);
+
 static struct sock *vsock_dequeue_accept(struct sock *listener)
 {
 	struct vsock_sock *vlistener;
@@ -414,6 +480,9 @@ static int vsock_send_shutdown(struct sock *sk, int mode)
 {
 	struct vsock_sock *vsk = vsock_sk(sk);
 
+	if (!vsk->transport)
+		return -ENODEV;
+
 	return vsk->transport->shutdown(vsk, mode);
 }
 
@@ -530,7 +599,6 @@ static int __vsock_bind_dgram(struct vsock_sock *vsk,
 static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
 {
 	struct vsock_sock *vsk = vsock_sk(sk);
-	u32 cid;
 	int retval;
 
 	/* First ensure this socket isn't already bound. */
@@ -540,10 +608,9 @@ static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr)
 	/* Now bind to the provided address or select appropriate values if
 	 * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY).  Note that
 	 * like AF_INET prevents binding to a non-local IP address (in most
-	 * cases), we only allow binding to the local CID.
+	 * cases), we only allow binding to a local CID.
 	 */
-	cid = vsk->transport->get_local_cid();
-	if (addr->svm_cid != cid && addr->svm_cid != VMADDR_CID_ANY)
+	if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid))
 		return -EADDRNOTAVAIL;
 
 	switch (sk->sk_socket->type) {
@@ -592,7 +659,6 @@ static struct sock *__vsock_create(struct net *net,
 		sk->sk_type = type;
 
 	vsk = vsock_sk(sk);
-	vsk->transport = transport_single;
 	vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
 	vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY);
 
@@ -629,11 +695,6 @@ static struct sock *__vsock_create(struct net *net,
 		vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE;
 	}
 
-	if (vsk->transport->init(vsk, psk) < 0) {
-		sk_free(sk);
-		return NULL;
-	}
-
 	return sk;
 }
 
@@ -649,7 +710,10 @@ static void __vsock_release(struct sock *sk, int level)
 		/* The release call is supposed to use lock_sock_nested()
 		 * rather than lock_sock(), if a sock lock should be acquired.
 		 */
-		vsk->transport->release(vsk);
+		if (vsk->transport)
+			vsk->transport->release(vsk);
+		else if (sk->sk_type == SOCK_STREAM)
+			vsock_remove_sock(vsk);
 
 		/* When "level" is SINGLE_DEPTH_NESTING, use the nested
 		 * version to avoid the warning "possible recursive locking
@@ -677,7 +741,8 @@ static void vsock_sk_destruct(struct sock *sk)
 {
 	struct vsock_sock *vsk = vsock_sk(sk);
 
-	vsk->transport->destruct(vsk);
+	if (vsk->transport)
+		vsk->transport->destruct(vsk);
 
 	/* When clearing these addresses, there's no need to set the family and
 	 * possibly register the address family with the kernel.
@@ -894,7 +959,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
 			mask |= EPOLLIN | EPOLLRDNORM;
 
 		/* If there is something in the queue then we can read. */
-		if (transport->stream_is_active(vsk) &&
+		if (transport && transport->stream_is_active(vsk) &&
 		    !(sk->sk_shutdown & RCV_SHUTDOWN)) {
 			bool data_ready_now = false;
 			int ret = transport->notify_poll_in(
@@ -1144,7 +1209,6 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
 	err = 0;
 	sk = sock->sk;
 	vsk = vsock_sk(sk);
-	transport = vsk->transport;
 
 	lock_sock(sk);
 
@@ -1172,19 +1236,26 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
 			goto out;
 		}
 
+		/* Set the remote address that we are connecting to. */
+		memcpy(&vsk->remote_addr, remote_addr,
+		       sizeof(vsk->remote_addr));
+
+		err = vsock_assign_transport(vsk, NULL);
+		if (err)
+			goto out;
+
+		transport = vsk->transport;
+
 		/* The hypervisor and well-known contexts do not have socket
 		 * endpoints.
 		 */
-		if (!transport->stream_allow(remote_addr->svm_cid,
+		if (!transport ||
+		    !transport->stream_allow(remote_addr->svm_cid,
 					     remote_addr->svm_port)) {
 			err = -ENETUNREACH;
 			goto out;
 		}
 
-		/* Set the remote address that we are connecting to. */
-		memcpy(&vsk->remote_addr, remote_addr,
-		       sizeof(vsk->remote_addr));
-
 		err = vsock_auto_bind(vsk);
 		if (err)
 			goto out;
@@ -1584,7 +1655,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 		goto out;
 	}
 
-	if (sk->sk_state != TCP_ESTABLISHED ||
+	if (!transport || sk->sk_state != TCP_ESTABLISHED ||
 	    !vsock_addr_bound(&vsk->local_addr)) {
 		err = -ENOTCONN;
 		goto out;
@@ -1710,7 +1781,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 
 	lock_sock(sk);
 
-	if (sk->sk_state != TCP_ESTABLISHED) {
+	if (!transport || sk->sk_state != TCP_ESTABLISHED) {
 		/* Recvmsg is supposed to return 0 if a peer performs an
 		 * orderly shutdown. Differentiate between that case and when a
 		 * peer has not connected or a local shutdown occured with the
@@ -1884,7 +1955,9 @@ static const struct proto_ops vsock_stream_ops = {
 static int vsock_create(struct net *net, struct socket *sock,
 			int protocol, int kern)
 {
+	struct vsock_sock *vsk;
 	struct sock *sk;
+	int ret;
 
 	if (!sock)
 		return -EINVAL;
@@ -1909,7 +1982,17 @@ static int vsock_create(struct net *net, struct socket *sock,
 	if (!sk)
 		return -ENOMEM;
 
-	vsock_insert_unbound(vsock_sk(sk));
+	vsk = vsock_sk(sk);
+
+	if (sock->type == SOCK_DGRAM) {
+		ret = vsock_assign_transport(vsk, NULL);
+		if (ret < 0) {
+			sock_put(sk);
+			return ret;
+		}
+	}
+
+	vsock_insert_unbound(vsk);
 
 	return 0;
 }
@@ -1924,11 +2007,20 @@ static long vsock_dev_do_ioctl(struct file *filp,
 			       unsigned int cmd, void __user *ptr)
 {
 	u32 __user *p = ptr;
+	u32 cid = VMADDR_CID_ANY;
 	int retval = 0;
 
 	switch (cmd) {
 	case IOCTL_VM_SOCKETS_GET_LOCAL_CID:
-		if (put_user(transport_single->get_local_cid(), p) != 0)
+		/* To be compatible with the VMCI behavior, we prioritize the
+		 * guest CID instead of well-know host CID (VMADDR_CID_HOST).
+		 */
+		if (transport_g2h)
+			cid = transport_g2h->get_local_cid();
+		else if (transport_h2g)
+			cid = transport_h2g->get_local_cid();
+
+		if (put_user(cid, p) != 0)
 			retval = -EFAULT;
 		break;
 
@@ -1968,24 +2060,13 @@ static struct miscdevice vsock_device = {
 	.fops		= &vsock_device_ops,
 };
 
-int __vsock_core_init(const struct vsock_transport *t, struct module *owner)
+static int __init vsock_init(void)
 {
-	int err = mutex_lock_interruptible(&vsock_register_mutex);
+	int err = 0;
 
-	if (err)
-		return err;
-
-	if (transport_single) {
-		err = -EBUSY;
-		goto err_busy;
-	}
-
-	/* Transport must be the owner of the protocol so that it can't
-	 * unload while there are open sockets.
-	 */
-	vsock_proto.owner = owner;
-	transport_single = t;
+	vsock_init_tables();
 
+	vsock_proto.owner = THIS_MODULE;
 	vsock_device.minor = MISC_DYNAMIC_MINOR;
 	err = misc_register(&vsock_device);
 	if (err) {
@@ -2006,7 +2087,6 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner)
 		goto err_unregister_proto;
 	}
 
-	mutex_unlock(&vsock_register_mutex);
 	return 0;
 
 err_unregister_proto:
@@ -2014,28 +2094,15 @@ err_unregister_proto:
 err_deregister_misc:
 	misc_deregister(&vsock_device);
 err_reset_transport:
-	transport_single = NULL;
-err_busy:
-	mutex_unlock(&vsock_register_mutex);
 	return err;
 }
-EXPORT_SYMBOL_GPL(__vsock_core_init);
 
-void vsock_core_exit(void)
+static void __exit vsock_exit(void)
 {
-	mutex_lock(&vsock_register_mutex);
-
 	misc_deregister(&vsock_device);
 	sock_unregister(AF_VSOCK);
 	proto_unregister(&vsock_proto);
-
-	/* We do not want the assignment below re-ordered. */
-	mb();
-	transport_single = NULL;
-
-	mutex_unlock(&vsock_register_mutex);
 }
-EXPORT_SYMBOL_GPL(vsock_core_exit);
 
 const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk)
 {
@@ -2043,12 +2110,70 @@ const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk)
 }
 EXPORT_SYMBOL_GPL(vsock_core_get_transport);
 
-static void __exit vsock_exit(void)
+int vsock_core_register(const struct vsock_transport *t, int features)
+{
+	const struct vsock_transport *t_h2g, *t_g2h, *t_dgram;
+	int err = mutex_lock_interruptible(&vsock_register_mutex);
+
+	if (err)
+		return err;
+
+	t_h2g = transport_h2g;
+	t_g2h = transport_g2h;
+	t_dgram = transport_dgram;
+
+	if (features & VSOCK_TRANSPORT_F_H2G) {
+		if (t_h2g) {
+			err = -EBUSY;
+			goto err_busy;
+		}
+		t_h2g = t;
+	}
+
+	if (features & VSOCK_TRANSPORT_F_G2H) {
+		if (t_g2h) {
+			err = -EBUSY;
+			goto err_busy;
+		}
+		t_g2h = t;
+	}
+
+	if (features & VSOCK_TRANSPORT_F_DGRAM) {
+		if (t_dgram) {
+			err = -EBUSY;
+			goto err_busy;
+		}
+		t_dgram = t;
+	}
+
+	transport_h2g = t_h2g;
+	transport_g2h = t_g2h;
+	transport_dgram = t_dgram;
+
+err_busy:
+	mutex_unlock(&vsock_register_mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(vsock_core_register);
+
+void vsock_core_unregister(const struct vsock_transport *t)
 {
-	/* Do nothing.  This function makes this module removable. */
+	mutex_lock(&vsock_register_mutex);
+
+	if (transport_h2g == t)
+		transport_h2g = NULL;
+
+	if (transport_g2h == t)
+		transport_g2h = NULL;
+
+	if (transport_dgram == t)
+		transport_dgram = NULL;
+
+	mutex_unlock(&vsock_register_mutex);
 }
+EXPORT_SYMBOL_GPL(vsock_core_unregister);
 
-module_init(vsock_init_tables);
+module_init(vsock_init);
 module_exit(vsock_exit);
 
 MODULE_AUTHOR("VMware, Inc.");
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 22b608805a91..1c9e65d7d94d 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -165,6 +165,8 @@ static const guid_t srv_id_template =
 	GUID_INIT(0x00000000, 0xfacb, 0x11e6, 0xbd, 0x58,
 		  0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3);
 
+static bool hvs_check_transport(struct vsock_sock *vsk);
+
 static bool is_valid_srv_id(const guid_t *id)
 {
 	return !memcmp(&id->b[4], &srv_id_template.b[4], sizeof(guid_t) - 4);
@@ -367,6 +369,18 @@ static void hvs_open_connection(struct vmbus_channel *chan)
 
 		new->sk_state = TCP_SYN_SENT;
 		vnew = vsock_sk(new);
+
+		hvs_addr_init(&vnew->local_addr, if_type);
+		hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr);
+
+		ret = vsock_assign_transport(vnew, vsock_sk(sk));
+		/* Transport assigned (looking at remote_addr) must be the
+		 * same where we received the request.
+		 */
+		if (ret || !hvs_check_transport(vnew)) {
+			sock_put(new);
+			goto out;
+		}
 		hvs_new = vnew->trans;
 		hvs_new->chan = chan;
 	} else {
@@ -430,9 +444,6 @@ static void hvs_open_connection(struct vmbus_channel *chan)
 		new->sk_state = TCP_ESTABLISHED;
 		sk_acceptq_added(sk);
 
-		hvs_addr_init(&vnew->local_addr, if_type);
-		hvs_remote_addr_init(&vnew->remote_addr, &vnew->local_addr);
-
 		hvs_new->vm_srv_id = *if_type;
 		hvs_new->host_srv_id = *if_instance;
 
@@ -880,6 +891,11 @@ static struct vsock_transport hvs_transport = {
 
 };
 
+static bool hvs_check_transport(struct vsock_sock *vsk)
+{
+	return vsk->transport == &hvs_transport;
+}
+
 static int hvs_probe(struct hv_device *hdev,
 		     const struct hv_vmbus_device_id *dev_id)
 {
@@ -928,7 +944,7 @@ static int __init hvs_init(void)
 	if (ret != 0)
 		return ret;
 
-	ret = vsock_core_init(&hvs_transport);
+	ret = vsock_core_register(&hvs_transport, VSOCK_TRANSPORT_F_G2H);
 	if (ret) {
 		vmbus_driver_unregister(&hvs_drv);
 		return ret;
@@ -939,7 +955,7 @@ static int __init hvs_init(void)
 
 static void __exit hvs_exit(void)
 {
-	vsock_core_exit();
+	vsock_core_unregister(&hvs_transport);
 	vmbus_driver_unregister(&hvs_drv);
 }
 
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index fb1fc7760e8c..83ad85050384 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -770,7 +770,8 @@ static int __init virtio_vsock_init(void)
 	if (!virtio_vsock_workqueue)
 		return -ENOMEM;
 
-	ret = vsock_core_init(&virtio_transport.transport);
+	ret = vsock_core_register(&virtio_transport.transport,
+				  VSOCK_TRANSPORT_F_G2H);
 	if (ret)
 		goto out_wq;
 
@@ -781,7 +782,7 @@ static int __init virtio_vsock_init(void)
 	return 0;
 
 out_vci:
-	vsock_core_exit();
+	vsock_core_unregister(&virtio_transport.transport);
 out_wq:
 	destroy_workqueue(virtio_vsock_workqueue);
 	return ret;
@@ -790,7 +791,7 @@ out_wq:
 static void __exit virtio_vsock_exit(void)
 {
 	unregister_virtio_driver(&virtio_vsock_driver);
-	vsock_core_exit();
+	vsock_core_unregister(&virtio_transport.transport);
 	destroy_workqueue(virtio_vsock_workqueue);
 }
 
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index b7b1a98e478e..e5ea29c6bca7 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -453,7 +453,7 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk,
 
 	vsk->trans = vvs;
 	vvs->vsk = vsk;
-	if (psk) {
+	if (psk && psk->trans) {
 		struct virtio_vsock_sock *ptrans = psk->trans;
 
 		vvs->peer_buf_alloc = ptrans->peer_buf_alloc;
@@ -986,13 +986,39 @@ virtio_transport_send_response(struct vsock_sock *vsk,
 	return virtio_transport_send_pkt_info(vsk, &info);
 }
 
+static bool virtio_transport_space_update(struct sock *sk,
+					  struct virtio_vsock_pkt *pkt)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	bool space_available;
+
+	/* Listener sockets are not associated with any transport, so we are
+	 * not able to take the state to see if there is space available in the
+	 * remote peer, but since they are only used to receive requests, we
+	 * can assume that there is always space available in the other peer.
+	 */
+	if (!vvs)
+		return true;
+
+	/* buf_alloc and fwd_cnt is always included in the hdr */
+	spin_lock_bh(&vvs->tx_lock);
+	vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc);
+	vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt);
+	space_available = virtio_transport_has_space(vsk);
+	spin_unlock_bh(&vvs->tx_lock);
+	return space_available;
+}
+
 /* Handle server socket */
 static int
-virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
+virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt,
+			     struct virtio_transport *t)
 {
 	struct vsock_sock *vsk = vsock_sk(sk);
 	struct vsock_sock *vchild;
 	struct sock *child;
+	int ret;
 
 	if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) {
 		virtio_transport_reset(vsk, pkt);
@@ -1022,6 +1048,20 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
 	vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid),
 			le32_to_cpu(pkt->hdr.src_port));
 
+	ret = vsock_assign_transport(vchild, vsk);
+	/* Transport assigned (looking at remote_addr) must be the same
+	 * where we received the request.
+	 */
+	if (ret || vchild->transport != &t->transport) {
+		release_sock(child);
+		virtio_transport_reset(vsk, pkt);
+		sock_put(child);
+		return ret;
+	}
+
+	if (virtio_transport_space_update(child, pkt))
+		child->sk_write_space(child);
+
 	vsock_insert_connected(vchild);
 	vsock_enqueue_accept(sk, child);
 	virtio_transport_send_response(vchild, pkt);
@@ -1032,22 +1072,6 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
 	return 0;
 }
 
-static bool virtio_transport_space_update(struct sock *sk,
-					  struct virtio_vsock_pkt *pkt)
-{
-	struct vsock_sock *vsk = vsock_sk(sk);
-	struct virtio_vsock_sock *vvs = vsk->trans;
-	bool space_available;
-
-	/* buf_alloc and fwd_cnt is always included in the hdr */
-	spin_lock_bh(&vvs->tx_lock);
-	vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc);
-	vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt);
-	space_available = virtio_transport_has_space(vsk);
-	spin_unlock_bh(&vvs->tx_lock);
-	return space_available;
-}
-
 /* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex
  * lock.
  */
@@ -1104,7 +1128,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
 
 	switch (sk->sk_state) {
 	case TCP_LISTEN:
-		virtio_transport_recv_listen(sk, pkt);
+		virtio_transport_recv_listen(sk, pkt, t);
 		virtio_transport_free_pkt(pkt);
 		break;
 	case TCP_SYN_SENT:
@@ -1122,6 +1146,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
 		virtio_transport_free_pkt(pkt);
 		break;
 	}
+
 	release_sock(sk);
 
 	/* Release refcnt obtained when we fetched this socket out of the
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index b6c8c9cc8d72..86030ecb53dd 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -57,6 +57,7 @@ static bool vmci_transport_old_proto_override(bool *old_pkt_proto);
 static u16 vmci_transport_new_proto_supported_versions(void);
 static bool vmci_transport_proto_to_notify_struct(struct sock *sk, u16 *proto,
 						  bool old_pkt_proto);
+static bool vmci_check_transport(struct vsock_sock *vsk);
 
 struct vmci_transport_recv_pkt_info {
 	struct work_struct work;
@@ -1017,6 +1018,16 @@ static int vmci_transport_recv_listen(struct sock *sk,
 	vsock_addr_init(&vpending->remote_addr, pkt->dg.src.context,
 			pkt->src_port);
 
+	err = vsock_assign_transport(vpending, vsock_sk(sk));
+	/* Transport assigned (looking at remote_addr) must be the same
+	 * where we received the request.
+	 */
+	if (err || !vmci_check_transport(vpending)) {
+		vmci_transport_send_reset(sk, pkt);
+		sock_put(pending);
+		return err;
+	}
+
 	/* If the proposed size fits within our min/max, accept it. Otherwise
 	 * propose our own size.
 	 */
@@ -2008,7 +2019,7 @@ static u32 vmci_transport_get_local_cid(void)
 	return vmci_get_context_id();
 }
 
-static const struct vsock_transport vmci_transport = {
+static struct vsock_transport vmci_transport = {
 	.init = vmci_transport_socket_init,
 	.destruct = vmci_transport_destruct,
 	.release = vmci_transport_release,
@@ -2038,10 +2049,25 @@ static const struct vsock_transport vmci_transport = {
 	.get_local_cid = vmci_transport_get_local_cid,
 };
 
+static bool vmci_check_transport(struct vsock_sock *vsk)
+{
+	return vsk->transport == &vmci_transport;
+}
+
 static int __init vmci_transport_init(void)
 {
+	int features = VSOCK_TRANSPORT_F_DGRAM | VSOCK_TRANSPORT_F_H2G;
+	int cid;
 	int err;
 
+	cid = vmci_get_context_id();
+
+	if (cid == VMCI_INVALID_ID)
+		return -EINVAL;
+
+	if (cid != VMCI_HOST_CONTEXT_ID)
+		features |= VSOCK_TRANSPORT_F_G2H;
+
 	/* Create the datagram handle that we will use to send and receive all
 	 * VSocket control messages for this context.
 	 */
@@ -2065,7 +2091,7 @@ static int __init vmci_transport_init(void)
 		goto err_destroy_stream_handle;
 	}
 
-	err = vsock_core_init(&vmci_transport);
+	err = vsock_core_register(&vmci_transport, features);
 	if (err < 0)
 		goto err_unsubscribe;
 
@@ -2096,7 +2122,7 @@ static void __exit vmci_transport_exit(void)
 		vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
 	}
 
-	vsock_core_exit();
+	vsock_core_unregister(&vmci_transport);
 }
 module_exit(vmci_transport_exit);
 
-- 
cgit v1.2.3-59-g8ed1b


From b1bba80a4376aef34de2b57bfb8834bd095703ed Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Thu, 14 Nov 2019 10:57:47 +0100
Subject: vsock/vmci: register vmci_transport only when VMCI guest/host are
 active

To allow other transports to be loaded with vmci_transport,
we register the vmci_transport as G2H or H2G only when a VMCI guest
or host is active.

To do that, this patch adds a callback registered in the vmci driver
that will be called when the host or guest becomes active.
This callback will register the vmci_transport in the VSOCK core.

Cc: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/misc/vmw_vmci/vmci_driver.c | 67 +++++++++++++++++++++++++++++++++++++
 drivers/misc/vmw_vmci/vmci_driver.h |  2 ++
 drivers/misc/vmw_vmci/vmci_guest.c  |  2 ++
 drivers/misc/vmw_vmci/vmci_host.c   |  7 ++++
 include/linux/vmw_vmci_api.h        |  2 ++
 net/vmw_vsock/vmci_transport.c      | 33 ++++++++++++------
 6 files changed, 102 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/vmw_vmci/vmci_driver.c b/drivers/misc/vmw_vmci/vmci_driver.c
index 819e35995d32..95fed4664a2d 100644
--- a/drivers/misc/vmw_vmci/vmci_driver.c
+++ b/drivers/misc/vmw_vmci/vmci_driver.c
@@ -28,6 +28,10 @@ MODULE_PARM_DESC(disable_guest,
 static bool vmci_guest_personality_initialized;
 static bool vmci_host_personality_initialized;
 
+static DEFINE_MUTEX(vmci_vsock_mutex); /* protects vmci_vsock_transport_cb */
+static vmci_vsock_cb vmci_vsock_transport_cb;
+bool vmci_vsock_cb_host_called;
+
 /*
  * vmci_get_context_id() - Gets the current context ID.
  *
@@ -45,6 +49,69 @@ u32 vmci_get_context_id(void)
 }
 EXPORT_SYMBOL_GPL(vmci_get_context_id);
 
+/*
+ * vmci_register_vsock_callback() - Register the VSOCK vmci_transport callback.
+ *
+ * The callback will be called when the first host or guest becomes active,
+ * or if they are already active when this function is called.
+ * To unregister the callback, call this function with NULL parameter.
+ *
+ * Returns 0 on success. -EBUSY if a callback is already registered.
+ */
+int vmci_register_vsock_callback(vmci_vsock_cb callback)
+{
+	int err = 0;
+
+	mutex_lock(&vmci_vsock_mutex);
+
+	if (vmci_vsock_transport_cb && callback) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	vmci_vsock_transport_cb = callback;
+
+	if (!vmci_vsock_transport_cb) {
+		vmci_vsock_cb_host_called = false;
+		goto out;
+	}
+
+	if (vmci_guest_code_active())
+		vmci_vsock_transport_cb(false);
+
+	if (vmci_host_users() > 0) {
+		vmci_vsock_cb_host_called = true;
+		vmci_vsock_transport_cb(true);
+	}
+
+out:
+	mutex_unlock(&vmci_vsock_mutex);
+	return err;
+}
+EXPORT_SYMBOL_GPL(vmci_register_vsock_callback);
+
+void vmci_call_vsock_callback(bool is_host)
+{
+	mutex_lock(&vmci_vsock_mutex);
+
+	if (!vmci_vsock_transport_cb)
+		goto out;
+
+	/* In the host, this function could be called multiple times,
+	 * but we want to register it only once.
+	 */
+	if (is_host) {
+		if (vmci_vsock_cb_host_called)
+			goto out;
+
+		vmci_vsock_cb_host_called = true;
+	}
+
+	vmci_vsock_transport_cb(is_host);
+out:
+	mutex_unlock(&vmci_vsock_mutex);
+}
+
 static int __init vmci_drv_init(void)
 {
 	int vmci_err;
diff --git a/drivers/misc/vmw_vmci/vmci_driver.h b/drivers/misc/vmw_vmci/vmci_driver.h
index aab81b67670c..990682480bf6 100644
--- a/drivers/misc/vmw_vmci/vmci_driver.h
+++ b/drivers/misc/vmw_vmci/vmci_driver.h
@@ -36,10 +36,12 @@ extern struct pci_dev *vmci_pdev;
 
 u32 vmci_get_context_id(void);
 int vmci_send_datagram(struct vmci_datagram *dg);
+void vmci_call_vsock_callback(bool is_host);
 
 int vmci_host_init(void);
 void vmci_host_exit(void);
 bool vmci_host_code_active(void);
+int vmci_host_users(void);
 
 int vmci_guest_init(void);
 void vmci_guest_exit(void);
diff --git a/drivers/misc/vmw_vmci/vmci_guest.c b/drivers/misc/vmw_vmci/vmci_guest.c
index 7a84a48c75da..cc8eeb361fcd 100644
--- a/drivers/misc/vmw_vmci/vmci_guest.c
+++ b/drivers/misc/vmw_vmci/vmci_guest.c
@@ -637,6 +637,8 @@ static int vmci_guest_probe_device(struct pci_dev *pdev,
 		  vmci_dev->iobase + VMCI_CONTROL_ADDR);
 
 	pci_set_drvdata(pdev, vmci_dev);
+
+	vmci_call_vsock_callback(false);
 	return 0;
 
 err_free_irq:
diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c
index 833e2bd248a5..ff3c396146ff 100644
--- a/drivers/misc/vmw_vmci/vmci_host.c
+++ b/drivers/misc/vmw_vmci/vmci_host.c
@@ -108,6 +108,11 @@ bool vmci_host_code_active(void)
 	     atomic_read(&vmci_host_active_users) > 0);
 }
 
+int vmci_host_users(void)
+{
+	return atomic_read(&vmci_host_active_users);
+}
+
 /*
  * Called on open of /dev/vmci.
  */
@@ -338,6 +343,8 @@ static int vmci_host_do_init_context(struct vmci_host_dev *vmci_host_dev,
 	vmci_host_dev->ct_type = VMCIOBJ_CONTEXT;
 	atomic_inc(&vmci_host_active_users);
 
+	vmci_call_vsock_callback(true);
+
 	retval = 0;
 
 out:
diff --git a/include/linux/vmw_vmci_api.h b/include/linux/vmw_vmci_api.h
index acd9fafe4fc6..f28907345c80 100644
--- a/include/linux/vmw_vmci_api.h
+++ b/include/linux/vmw_vmci_api.h
@@ -19,6 +19,7 @@
 struct msghdr;
 typedef void (vmci_device_shutdown_fn) (void *device_registration,
 					void *user_data);
+typedef void (*vmci_vsock_cb) (bool is_host);
 
 int vmci_datagram_create_handle(u32 resource_id, u32 flags,
 				vmci_datagram_recv_cb recv_cb,
@@ -37,6 +38,7 @@ int vmci_doorbell_destroy(struct vmci_handle handle);
 int vmci_doorbell_notify(struct vmci_handle handle, u32 priv_flags);
 u32 vmci_get_context_id(void);
 bool vmci_is_context_owner(u32 context_id, kuid_t uid);
+int vmci_register_vsock_callback(vmci_vsock_cb callback);
 
 int vmci_event_subscribe(u32 event,
 			 vmci_event_cb callback, void *callback_data,
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 86030ecb53dd..d9c9c834ad6f 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -2054,19 +2054,21 @@ static bool vmci_check_transport(struct vsock_sock *vsk)
 	return vsk->transport == &vmci_transport;
 }
 
-static int __init vmci_transport_init(void)
+void vmci_vsock_transport_cb(bool is_host)
 {
-	int features = VSOCK_TRANSPORT_F_DGRAM | VSOCK_TRANSPORT_F_H2G;
-	int cid;
-	int err;
+	int features;
 
-	cid = vmci_get_context_id();
+	if (is_host)
+		features = VSOCK_TRANSPORT_F_H2G;
+	else
+		features = VSOCK_TRANSPORT_F_G2H;
 
-	if (cid == VMCI_INVALID_ID)
-		return -EINVAL;
+	vsock_core_register(&vmci_transport, features);
+}
 
-	if (cid != VMCI_HOST_CONTEXT_ID)
-		features |= VSOCK_TRANSPORT_F_G2H;
+static int __init vmci_transport_init(void)
+{
+	int err;
 
 	/* Create the datagram handle that we will use to send and receive all
 	 * VSocket control messages for this context.
@@ -2080,7 +2082,6 @@ static int __init vmci_transport_init(void)
 		pr_err("Unable to create datagram handle. (%d)\n", err);
 		return vmci_transport_error_to_vsock_error(err);
 	}
-
 	err = vmci_event_subscribe(VMCI_EVENT_QP_RESUMED,
 				   vmci_transport_qp_resumed_cb,
 				   NULL, &vmci_transport_qp_resumed_sub_id);
@@ -2091,12 +2092,21 @@ static int __init vmci_transport_init(void)
 		goto err_destroy_stream_handle;
 	}
 
-	err = vsock_core_register(&vmci_transport, features);
+	/* Register only with dgram feature, other features (H2G, G2H) will be
+	 * registered when the first host or guest becomes active.
+	 */
+	err = vsock_core_register(&vmci_transport, VSOCK_TRANSPORT_F_DGRAM);
 	if (err < 0)
 		goto err_unsubscribe;
 
+	err = vmci_register_vsock_callback(vmci_vsock_transport_cb);
+	if (err < 0)
+		goto err_unregister;
+
 	return 0;
 
+err_unregister:
+	vsock_core_unregister(&vmci_transport);
 err_unsubscribe:
 	vmci_event_unsubscribe(vmci_transport_qp_resumed_sub_id);
 err_destroy_stream_handle:
@@ -2122,6 +2132,7 @@ static void __exit vmci_transport_exit(void)
 		vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID;
 	}
 
+	vmci_register_vsock_callback(NULL);
 	vsock_core_unregister(&vmci_transport);
 }
 module_exit(vmci_transport_exit);
-- 
cgit v1.2.3-59-g8ed1b


From 6a2c0962105ae8ceba182c4f616e0e41d7755591 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Thu, 14 Nov 2019 10:57:48 +0100
Subject: vsock: prevent transport modules unloading

This patch adds 'module' member in the 'struct vsock_transport'
in order to get/put the transport module. This prevents the
module unloading while sockets are assigned to it.

We increase the module refcnt when a socket is assigned to a
transport, and we decrease the module refcnt when the socket
is destructed.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/vhost/vsock.c            |  2 ++
 include/net/af_vsock.h           |  2 ++
 net/vmw_vsock/af_vsock.c         | 20 ++++++++++++++++----
 net/vmw_vsock/hyperv_transport.c |  2 ++
 net/vmw_vsock/virtio_transport.c |  2 ++
 net/vmw_vsock/vmci_transport.c   |  1 +
 6 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index b235f4bbe8ea..fdda9ec625ad 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -386,6 +386,8 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
 
 static struct virtio_transport vhost_transport = {
 	.transport = {
+		.module                   = THIS_MODULE,
+
 		.get_local_cid            = vhost_transport_get_local_cid,
 
 		.init                     = virtio_transport_do_socket_init,
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index cf5c3691251b..4206dc6d813f 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -100,6 +100,8 @@ struct vsock_transport_send_notify_data {
 #define VSOCK_TRANSPORT_F_DGRAM		0x00000004
 
 struct vsock_transport {
+	struct module *module;
+
 	/* Initialize/tear-down socket. */
 	int (*init)(struct vsock_sock *, struct vsock_sock *);
 	void (*destruct)(struct vsock_sock *);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 5357714b6104..5cb0ae42d916 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -380,6 +380,16 @@ void vsock_enqueue_accept(struct sock *listener, struct sock *connected)
 }
 EXPORT_SYMBOL_GPL(vsock_enqueue_accept);
 
+static void vsock_deassign_transport(struct vsock_sock *vsk)
+{
+	if (!vsk->transport)
+		return;
+
+	vsk->transport->destruct(vsk);
+	module_put(vsk->transport->module);
+	vsk->transport = NULL;
+}
+
 /* Assign a transport to a socket and call the .init transport callback.
  *
  * Note: for stream socket this must be called when vsk->remote_addr is set
@@ -418,10 +428,13 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
 			return 0;
 
 		vsk->transport->release(vsk);
-		vsk->transport->destruct(vsk);
+		vsock_deassign_transport(vsk);
 	}
 
-	if (!new_transport)
+	/* We increase the module refcnt to prevent the transport unloading
+	 * while there are open sockets assigned to it.
+	 */
+	if (!new_transport || !try_module_get(new_transport->module))
 		return -ENODEV;
 
 	vsk->transport = new_transport;
@@ -741,8 +754,7 @@ static void vsock_sk_destruct(struct sock *sk)
 {
 	struct vsock_sock *vsk = vsock_sk(sk);
 
-	if (vsk->transport)
-		vsk->transport->destruct(vsk);
+	vsock_deassign_transport(vsk);
 
 	/* When clearing these addresses, there's no need to set the family and
 	 * possibly register the address family with the kernel.
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 1c9e65d7d94d..3c7d07a99fc5 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -857,6 +857,8 @@ int hvs_notify_send_post_enqueue(struct vsock_sock *vsk, ssize_t written,
 }
 
 static struct vsock_transport hvs_transport = {
+	.module                   = THIS_MODULE,
+
 	.get_local_cid            = hvs_get_local_cid,
 
 	.init                     = hvs_sock_init,
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 83ad85050384..1458c5c8b64d 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -462,6 +462,8 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
 
 static struct virtio_transport virtio_transport = {
 	.transport = {
+		.module                   = THIS_MODULE,
+
 		.get_local_cid            = virtio_transport_get_local_cid,
 
 		.init                     = virtio_transport_do_socket_init,
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index d9c9c834ad6f..644d32e43d23 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -2020,6 +2020,7 @@ static u32 vmci_transport_get_local_cid(void)
 }
 
 static struct vsock_transport vmci_transport = {
+	.module = THIS_MODULE,
 	.init = vmci_transport_socket_init,
 	.destruct = vmci_transport_destruct,
 	.release = vmci_transport_release,
-- 
cgit v1.2.3-59-g8ed1b


From 42bfba9eaa33dd4af0b50b87508062a41ec26653 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Thu, 14 Nov 2019 13:02:41 +0100
Subject: net/smc: immediate termination for SMCD link groups

SMCD link group termination is called when peer signals its shutdown
of its corresponding link group. For regular shutdowns no connections
exist anymore. For abnormal shutdowns connections must be killed and
their DMBs must be unregistered immediately. That means the SMCR method
to delay the link group freeing several seconds does not fit.

This patch adds immediate termination of a link group and its SMCD
connections and makes sure all SMCD link group related cleanup steps
are finished.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/ism.h |  2 --
 include/net/smc.h      |  2 ++
 net/smc/smc_close.c    | 25 +++++++++++++++++++------
 net/smc/smc_core.c     | 46 +++++++++++++++++++++++++++++++++++++++-------
 net/smc/smc_ism.c      | 14 ++++++++++++--
 5 files changed, 72 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h
index 66eac2b9704d..1901e9c80ed8 100644
--- a/drivers/s390/net/ism.h
+++ b/drivers/s390/net/ism.h
@@ -32,8 +32,6 @@
 #define ISM_UNREG_SBA	0x11
 #define ISM_UNREG_IEQ	0x12
 
-#define ISM_ERROR	0xFFFF
-
 struct ism_req_hdr {
 	u32 cmd;
 	u16 : 16;
diff --git a/include/net/smc.h b/include/net/smc.h
index 05174ae4f325..7c2082341bb3 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -37,6 +37,8 @@ struct smcd_dmb {
 #define ISM_EVENT_GID	1
 #define ISM_EVENT_SWR	2
 
+#define ISM_ERROR	0xFFFF
+
 struct smcd_event {
 	u32 type;
 	u32 code;
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index d34e5adce2eb..d205b2114006 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -110,6 +110,17 @@ int smc_close_abort(struct smc_connection *conn)
 	return smc_cdc_get_slot_and_msg_send(conn);
 }
 
+static void smc_close_cancel_work(struct smc_sock *smc)
+{
+	struct sock *sk = &smc->sk;
+
+	release_sock(sk);
+	cancel_work_sync(&smc->conn.close_work);
+	cancel_delayed_work_sync(&smc->conn.tx_work);
+	lock_sock(sk);
+	sk->sk_state = SMC_CLOSED;
+}
+
 /* terminate smc socket abnormally - active abort
  * link group is terminated, i.e. RDMA communication no longer possible
  */
@@ -126,23 +137,21 @@ void smc_close_active_abort(struct smc_sock *smc)
 	switch (sk->sk_state) {
 	case SMC_ACTIVE:
 		sk->sk_state = SMC_PEERABORTWAIT;
-		release_sock(sk);
-		cancel_delayed_work_sync(&smc->conn.tx_work);
-		lock_sock(sk);
+		smc_close_cancel_work(smc);
 		sk->sk_state = SMC_CLOSED;
 		sock_put(sk); /* passive closing */
 		break;
 	case SMC_APPCLOSEWAIT1:
 	case SMC_APPCLOSEWAIT2:
-		release_sock(sk);
-		cancel_delayed_work_sync(&smc->conn.tx_work);
-		lock_sock(sk);
+		smc_close_cancel_work(smc);
 		sk->sk_state = SMC_CLOSED;
 		sock_put(sk); /* postponed passive closing */
 		break;
 	case SMC_PEERCLOSEWAIT1:
 	case SMC_PEERCLOSEWAIT2:
 	case SMC_PEERFINCLOSEWAIT:
+		sk->sk_state = SMC_PEERABORTWAIT;
+		smc_close_cancel_work(smc);
 		sk->sk_state = SMC_CLOSED;
 		smc_conn_free(&smc->conn);
 		release_clcsock = true;
@@ -150,7 +159,11 @@ void smc_close_active_abort(struct smc_sock *smc)
 		break;
 	case SMC_PROCESSABORT:
 	case SMC_APPFINCLOSEWAIT:
+		sk->sk_state = SMC_PEERABORTWAIT;
+		smc_close_cancel_work(smc);
 		sk->sk_state = SMC_CLOSED;
+		smc_conn_free(&smc->conn);
+		release_clcsock = true;
 		break;
 	case SMC_INIT:
 	case SMC_PEERABORTWAIT:
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 561f069b30de..9d6da2c7413d 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -214,7 +214,7 @@ static void smc_lgr_free_work(struct work_struct *work)
 
 	if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
 		smc_llc_link_inactive(lnk);
-	if (lgr->is_smcd)
+	if (lgr->is_smcd && !lgr->terminating)
 		smc_ism_signal_shutdown(lgr);
 	smc_lgr_free(lgr);
 }
@@ -381,7 +381,8 @@ void smc_conn_free(struct smc_connection *conn)
 	if (!lgr)
 		return;
 	if (lgr->is_smcd) {
-		smc_ism_unset_conn(conn);
+		if (!list_empty(&lgr->list))
+			smc_ism_unset_conn(conn);
 		tasklet_kill(&conn->rx_tsklet);
 	} else {
 		smc_cdc_tx_dismiss_slots(conn);
@@ -481,8 +482,10 @@ static void smc_lgr_free(struct smc_link_group *lgr)
 {
 	smc_lgr_free_bufs(lgr);
 	if (lgr->is_smcd) {
-		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
-		put_device(&lgr->smcd->dev);
+		if (!lgr->terminating) {
+			smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+			put_device(&lgr->smcd->dev);
+		}
 	} else {
 		smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
 		put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev);
@@ -503,6 +506,20 @@ void smc_lgr_forget(struct smc_link_group *lgr)
 	spin_unlock_bh(lgr_lock);
 }
 
+static void smcd_unregister_all_dmbs(struct smc_link_group *lgr)
+{
+	int i;
+
+	for (i = 0; i < SMC_RMBE_SIZES; i++) {
+		struct smc_buf_desc *buf_desc;
+
+		list_for_each_entry(buf_desc, &lgr->rmbs[i], list) {
+			buf_desc->len += sizeof(struct smcd_cdc_msg);
+			smc_ism_unregister_dmb(lgr->smcd, buf_desc);
+		}
+	}
+}
+
 static void smc_sk_wake_ups(struct smc_sock *smc)
 {
 	smc->sk.sk_write_space(&smc->sk);
@@ -522,12 +539,28 @@ static void smc_conn_kill(struct smc_connection *conn)
 	conn->killed = 1;
 	smc->sk.sk_err = ECONNABORTED;
 	smc_sk_wake_ups(smc);
-	if (conn->lgr->is_smcd)
+	if (conn->lgr->is_smcd) {
+		smc_ism_unset_conn(conn);
 		tasklet_kill(&conn->rx_tsklet);
+	}
 	smc_lgr_unregister_conn(conn);
 	smc_close_active_abort(smc);
 }
 
+static void smc_lgr_cleanup(struct smc_link_group *lgr)
+{
+	if (lgr->is_smcd) {
+		smc_ism_signal_shutdown(lgr);
+		smcd_unregister_all_dmbs(lgr);
+		smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
+		put_device(&lgr->smcd->dev);
+	} else {
+		struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+
+		wake_up(&lnk->wr_reg_wait);
+	}
+}
+
 /* terminate link group */
 static void __smc_lgr_terminate(struct smc_link_group *lgr)
 {
@@ -557,8 +590,7 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr)
 		node = rb_first(&lgr->conns_all);
 	}
 	read_unlock_bh(&lgr->conns_lock);
-	if (!lgr->is_smcd)
-		wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
+	smc_lgr_cleanup(lgr);
 	smc_lgr_schedule_free_work_fast(lgr);
 }
 
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 18946e95a3be..903da947b20d 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -146,6 +146,10 @@ out:
 int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
 {
 	struct smcd_dmb dmb;
+	int rc = 0;
+
+	if (!dmb_desc->dma_addr)
+		return rc;
 
 	memset(&dmb, 0, sizeof(dmb));
 	dmb.dmb_tok = dmb_desc->token;
@@ -153,7 +157,13 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
 	dmb.cpu_addr = dmb_desc->cpu_addr;
 	dmb.dma_addr = dmb_desc->dma_addr;
 	dmb.dmb_len = dmb_desc->len;
-	return smcd->ops->unregister_dmb(smcd, &dmb);
+	rc = smcd->ops->unregister_dmb(smcd, &dmb);
+	if (!rc || rc == ISM_ERROR) {
+		dmb_desc->cpu_addr = NULL;
+		dmb_desc->dma_addr = 0;
+	}
+
+	return rc;
 }
 
 int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
@@ -375,7 +385,7 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno)
 
 	spin_lock_irqsave(&smcd->lock, flags);
 	conn = smcd->conn[dmbno];
-	if (conn)
+	if (conn && !conn->killed)
 		tasklet_schedule(&conn->rx_tsklet);
 	spin_unlock_irqrestore(&smcd->lock, flags);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 5edd6b9cb8d7c6c346c93c52a53735591127e879 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Thu, 14 Nov 2019 13:02:43 +0100
Subject: net/smc: introduce bookkeeping of SMCD link groups

If the ism module is unloaded return control from exit routine only,
if all link groups are freed.
If an IB device is thrown away return control from device removal only,
if all link groups belonging to this device are freed.
A counters for the total number of SMCD link groups per ISM device is
introduced. ism module unloading continues only if the total number of
SMCD link groups for all ISM devices is zero. ISM device
removal continues only it the total number of SMCD link groups per ISM
device has decreased to zero.

Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/smc.h  | 2 ++
 net/smc/smc_core.c | 6 ++++++
 net/smc/smc_ism.c  | 1 +
 3 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/net/smc.h b/include/net/smc.h
index 7c2082341bb3..646feb4bc75f 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -79,6 +79,8 @@ struct smcd_dev {
 	bool pnetid_by_user;
 	struct list_head lgr_list;
 	spinlock_t lgr_lock;
+	atomic_t lgr_cnt;
+	wait_queue_head_t lgrs_deleted;
 	u8 going_away : 1;
 };
 
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index d79dd78c1cd8..30854acb846c 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -276,6 +276,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
 		lgr_list = &ini->ism_dev->lgr_list;
 		lgr_lock = &lgr->smcd->lgr_lock;
 		lgr->peer_shutdown = 0;
+		atomic_inc(&ini->ism_dev->lgr_cnt);
 	} else {
 		/* SMC-R specific settings */
 		get_device(&ini->ib_dev->ibdev->dev);
@@ -486,6 +487,8 @@ static void smc_lgr_free(struct smc_link_group *lgr)
 			smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
 			put_device(&lgr->smcd->dev);
 		}
+		if (!atomic_dec_return(&lgr->smcd->lgr_cnt))
+			wake_up(&lgr->smcd->lgrs_deleted);
 	} else {
 		smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
 		put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev);
@@ -685,6 +688,9 @@ void smc_smcd_terminate_all(struct smcd_dev *smcd)
 		list_del_init(&lgr->list);
 		__smc_lgr_terminate(lgr, false);
 	}
+
+	if (atomic_read(&smcd->lgr_cnt))
+		wait_event(smcd->lgrs_deleted, !atomic_read(&smcd->lgr_cnt));
 }
 
 /* Determine vlan of internal TCP socket.
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 56cdab8be1fa..5c4727d5066e 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -302,6 +302,7 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
 	spin_lock_init(&smcd->lgr_lock);
 	INIT_LIST_HEAD(&smcd->vlan);
 	INIT_LIST_HEAD(&smcd->lgr_list);
+	init_waitqueue_head(&smcd->lgrs_deleted);
 	smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)",
 						 WQ_MEM_RECLAIM, name);
 	if (!smcd->event_wq) {
-- 
cgit v1.2.3-59-g8ed1b


From 5e2563650232a4d998a60b10d3679f65dd4c02fb Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 14 Nov 2019 17:03:27 +0200
Subject: net: mscc: ocelot: publish structure definitions to
 include/soc/mscc/ocelot.h

We will be registering another switch driver based on ocelot, which
lives under drivers/net/dsa.

Make sure the Felix DSA front-end has the necessary abstractions to
implement a new Ocelot driver instantiation. This includes the function
prototypes for implementing DSA callbacks.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.c |  78 +++---
 drivers/net/ethernet/mscc/ocelot.h | 482 +--------------------------------
 include/soc/mscc/ocelot.h          | 539 +++++++++++++++++++++++++++++++++++++
 3 files changed, 588 insertions(+), 511 deletions(-)
 create mode 100644 include/soc/mscc/ocelot.h

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 961f9a7c01e3..90c46ba763d7 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -21,7 +21,6 @@
 #include <net/netevent.h>
 #include <net/rtnetlink.h>
 #include <net/switchdev.h>
-#include <net/dsa.h>
 
 #include "ocelot.h"
 #include "ocelot_ace.h"
@@ -184,8 +183,8 @@ static void ocelot_vlan_mode(struct ocelot *ocelot, int port,
 	ocelot_write(ocelot, val, ANA_VLANMASK);
 }
 
-static void ocelot_port_vlan_filtering(struct ocelot *ocelot, int port,
-				       bool vlan_aware)
+void ocelot_port_vlan_filtering(struct ocelot *ocelot, int port,
+				bool vlan_aware)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	u32 val;
@@ -230,6 +229,7 @@ static void ocelot_port_vlan_filtering(struct ocelot *ocelot, int port,
 		       REW_TAG_CFG_TAG_CFG_M,
 		       REW_TAG_CFG, port);
 }
+EXPORT_SYMBOL(ocelot_port_vlan_filtering);
 
 static int ocelot_port_set_native_vlan(struct ocelot *ocelot, int port,
 				       u16 vid)
@@ -267,8 +267,8 @@ static void ocelot_port_set_pvid(struct ocelot *ocelot, int port, u16 pvid)
 	ocelot_port->pvid = pvid;
 }
 
-static int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid,
-			   bool untagged)
+int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid,
+		    bool untagged)
 {
 	int ret;
 
@@ -291,6 +291,7 @@ static int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid,
 
 	return 0;
 }
+EXPORT_SYMBOL(ocelot_vlan_add);
 
 static int ocelot_vlan_vid_add(struct net_device *dev, u16 vid, bool pvid,
 			       bool untagged)
@@ -312,7 +313,7 @@ static int ocelot_vlan_vid_add(struct net_device *dev, u16 vid, bool pvid,
 	return 0;
 }
 
-static int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid)
+int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	int ret;
@@ -333,6 +334,7 @@ static int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid)
 
 	return 0;
 }
+EXPORT_SYMBOL(ocelot_vlan_del);
 
 static int ocelot_vlan_vid_del(struct net_device *dev, u16 vid)
 {
@@ -404,8 +406,8 @@ static u16 ocelot_wm_enc(u16 value)
 	return value;
 }
 
-static void ocelot_adjust_link(struct ocelot *ocelot, int port,
-			       struct phy_device *phydev)
+void ocelot_adjust_link(struct ocelot *ocelot, int port,
+			struct phy_device *phydev)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	int speed, mode = 0;
@@ -471,6 +473,7 @@ static void ocelot_adjust_link(struct ocelot *ocelot, int port,
 			 SYS_MAC_FC_CFG, port);
 	ocelot_write_rix(ocelot, 0, ANA_POL_FLOWC, port);
 }
+EXPORT_SYMBOL(ocelot_adjust_link);
 
 static void ocelot_port_adjust_link(struct net_device *dev)
 {
@@ -481,8 +484,8 @@ static void ocelot_port_adjust_link(struct net_device *dev)
 	ocelot_adjust_link(ocelot, port, dev->phydev);
 }
 
-static void ocelot_port_enable(struct ocelot *ocelot, int port,
-			       struct phy_device *phy)
+void ocelot_port_enable(struct ocelot *ocelot, int port,
+			struct phy_device *phy)
 {
 	/* Enable receiving frames on the port, and activate auto-learning of
 	 * MAC addresses.
@@ -492,6 +495,7 @@ static void ocelot_port_enable(struct ocelot *ocelot, int port,
 			 ANA_PORT_PORT_CFG_PORTID_VAL(port),
 			 ANA_PORT_PORT_CFG, port);
 }
+EXPORT_SYMBOL(ocelot_port_enable);
 
 static int ocelot_port_open(struct net_device *dev)
 {
@@ -526,7 +530,7 @@ static int ocelot_port_open(struct net_device *dev)
 	return 0;
 }
 
-static void ocelot_port_disable(struct ocelot *ocelot, int port)
+void ocelot_port_disable(struct ocelot *ocelot, int port)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 
@@ -534,6 +538,7 @@ static void ocelot_port_disable(struct ocelot *ocelot, int port)
 	ocelot_rmw_rix(ocelot, 0, QSYS_SWITCH_PORT_MODE_PORT_ENA,
 		       QSYS_SWITCH_PORT_MODE, port);
 }
+EXPORT_SYMBOL(ocelot_port_disable);
 
 static int ocelot_port_stop(struct net_device *dev)
 {
@@ -790,9 +795,8 @@ static void ocelot_get_stats64(struct net_device *dev,
 	stats->collisions = ocelot_read(ocelot, SYS_COUNT_TX_COLLISION);
 }
 
-static int ocelot_fdb_add(struct ocelot *ocelot, int port,
-			  const unsigned char *addr, u16 vid,
-			  bool vlan_aware)
+int ocelot_fdb_add(struct ocelot *ocelot, int port,
+		   const unsigned char *addr, u16 vid, bool vlan_aware)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 
@@ -812,6 +816,7 @@ static int ocelot_fdb_add(struct ocelot *ocelot, int port,
 
 	return ocelot_mact_learn(ocelot, port, addr, vid, ENTRYTYPE_LOCKED);
 }
+EXPORT_SYMBOL(ocelot_fdb_add);
 
 static int ocelot_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 			       struct net_device *dev,
@@ -826,11 +831,12 @@ static int ocelot_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 	return ocelot_fdb_add(ocelot, port, addr, vid, priv->vlan_aware);
 }
 
-static int ocelot_fdb_del(struct ocelot *ocelot, int port,
-			  const unsigned char *addr, u16 vid)
+int ocelot_fdb_del(struct ocelot *ocelot, int port,
+		   const unsigned char *addr, u16 vid)
 {
 	return ocelot_mact_forget(ocelot, addr, vid);
 }
+EXPORT_SYMBOL(ocelot_fdb_del);
 
 static int ocelot_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
 			       struct net_device *dev,
@@ -940,8 +946,8 @@ static int ocelot_mact_read(struct ocelot *ocelot, int port, int row, int col,
 	return 0;
 }
 
-static int ocelot_fdb_dump(struct ocelot *ocelot, int port,
-			   dsa_fdb_dump_cb_t *cb, void *data)
+int ocelot_fdb_dump(struct ocelot *ocelot, int port,
+		    dsa_fdb_dump_cb_t *cb, void *data)
 {
 	int i, j;
 
@@ -973,6 +979,7 @@ static int ocelot_fdb_dump(struct ocelot *ocelot, int port,
 
 	return 0;
 }
+EXPORT_SYMBOL(ocelot_fdb_dump);
 
 static int ocelot_port_fdb_dump(struct sk_buff *skb,
 				struct netlink_callback *cb,
@@ -1153,8 +1160,7 @@ static const struct net_device_ops ocelot_port_netdev_ops = {
 	.ndo_do_ioctl			= ocelot_ioctl,
 };
 
-static void ocelot_get_strings(struct ocelot *ocelot, int port, u32 sset,
-			       u8 *data)
+void ocelot_get_strings(struct ocelot *ocelot, int port, u32 sset, u8 *data)
 {
 	int i;
 
@@ -1165,6 +1171,7 @@ static void ocelot_get_strings(struct ocelot *ocelot, int port, u32 sset,
 		memcpy(data + i * ETH_GSTRING_LEN, ocelot->stats_layout[i].name,
 		       ETH_GSTRING_LEN);
 }
+EXPORT_SYMBOL(ocelot_get_strings);
 
 static void ocelot_port_get_strings(struct net_device *netdev, u32 sset,
 				    u8 *data)
@@ -1216,7 +1223,7 @@ static void ocelot_check_stats_work(struct work_struct *work)
 			   OCELOT_STATS_CHECK_DELAY);
 }
 
-static void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data)
+void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data)
 {
 	int i;
 
@@ -1227,6 +1234,7 @@ static void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data)
 	for (i = 0; i < ocelot->num_stats; i++)
 		*data++ = ocelot->stats[port * ocelot->num_stats + i];
 }
+EXPORT_SYMBOL(ocelot_get_ethtool_stats);
 
 static void ocelot_port_get_ethtool_stats(struct net_device *dev,
 					  struct ethtool_stats *stats,
@@ -1239,13 +1247,14 @@ static void ocelot_port_get_ethtool_stats(struct net_device *dev,
 	ocelot_get_ethtool_stats(ocelot, port, data);
 }
 
-static int ocelot_get_sset_count(struct ocelot *ocelot, int port, int sset)
+int ocelot_get_sset_count(struct ocelot *ocelot, int port, int sset)
 {
 	if (sset != ETH_SS_STATS)
 		return -EOPNOTSUPP;
 
 	return ocelot->num_stats;
 }
+EXPORT_SYMBOL(ocelot_get_sset_count);
 
 static int ocelot_port_get_sset_count(struct net_device *dev, int sset)
 {
@@ -1256,8 +1265,8 @@ static int ocelot_port_get_sset_count(struct net_device *dev, int sset)
 	return ocelot_get_sset_count(ocelot, port, sset);
 }
 
-static int ocelot_get_ts_info(struct ocelot *ocelot, int port,
-			      struct ethtool_ts_info *info)
+int ocelot_get_ts_info(struct ocelot *ocelot, int port,
+		       struct ethtool_ts_info *info)
 {
 	info->phc_index = ocelot->ptp_clock ?
 			  ptp_clock_index(ocelot->ptp_clock) : -1;
@@ -1273,6 +1282,7 @@ static int ocelot_get_ts_info(struct ocelot *ocelot, int port,
 
 	return 0;
 }
+EXPORT_SYMBOL(ocelot_get_ts_info);
 
 static int ocelot_port_get_ts_info(struct net_device *dev,
 				   struct ethtool_ts_info *info)
@@ -1296,8 +1306,7 @@ static const struct ethtool_ops ocelot_ethtool_ops = {
 	.get_ts_info		= ocelot_port_get_ts_info,
 };
 
-static void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port,
-					u8 state)
+void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state)
 {
 	u32 port_cfg;
 	int p, i;
@@ -1358,6 +1367,7 @@ static void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port,
 		}
 	}
 }
+EXPORT_SYMBOL(ocelot_bridge_stp_state_set);
 
 static void ocelot_port_attr_stp_state_set(struct ocelot *ocelot, int port,
 					   struct switchdev_trans *trans,
@@ -1369,11 +1379,12 @@ static void ocelot_port_attr_stp_state_set(struct ocelot *ocelot, int port,
 	ocelot_bridge_stp_state_set(ocelot, port, state);
 }
 
-static void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs)
+void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs)
 {
 	ocelot_write(ocelot, ANA_AUTOAGE_AGE_PERIOD(msecs / 2),
 		     ANA_AUTOAGE);
 }
+EXPORT_SYMBOL(ocelot_set_ageing_time);
 
 static void ocelot_port_attr_ageing_set(struct ocelot *ocelot, int port,
 					unsigned long ageing_clock_t)
@@ -1604,8 +1615,8 @@ static int ocelot_port_obj_del(struct net_device *dev,
 	return ret;
 }
 
-static int ocelot_port_bridge_join(struct ocelot *ocelot, int port,
-				   struct net_device *bridge)
+int ocelot_port_bridge_join(struct ocelot *ocelot, int port,
+			    struct net_device *bridge)
 {
 	if (!ocelot->bridge_mask) {
 		ocelot->hw_bridge_dev = bridge;
@@ -1620,9 +1631,10 @@ static int ocelot_port_bridge_join(struct ocelot *ocelot, int port,
 
 	return 0;
 }
+EXPORT_SYMBOL(ocelot_port_bridge_join);
 
-static int ocelot_port_bridge_leave(struct ocelot *ocelot, int port,
-				    struct net_device *bridge)
+int ocelot_port_bridge_leave(struct ocelot *ocelot, int port,
+			     struct net_device *bridge)
 {
 	ocelot->bridge_mask &= ~BIT(port);
 
@@ -1633,6 +1645,7 @@ static int ocelot_port_bridge_leave(struct ocelot *ocelot, int port,
 	ocelot_port_set_pvid(ocelot, port, 0);
 	return ocelot_port_set_native_vlan(ocelot, port, 0);
 }
+EXPORT_SYMBOL(ocelot_port_bridge_leave);
 
 static void ocelot_set_aggr_pgids(struct ocelot *ocelot)
 {
@@ -2119,7 +2132,7 @@ static void ocelot_port_set_mtu(struct ocelot *ocelot, int port, size_t mtu)
 	ocelot_write(ocelot, ocelot_wm_enc(atop_wm), SYS_ATOP_TOT_CFG);
 }
 
-static void ocelot_init_port(struct ocelot *ocelot, int port)
+void ocelot_init_port(struct ocelot *ocelot, int port)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 
@@ -2166,6 +2179,7 @@ static void ocelot_init_port(struct ocelot *ocelot, int port)
 	/* Enable vcap lookups */
 	ocelot_vcap_enable(ocelot, port);
 }
+EXPORT_SYMBOL(ocelot_init_port);
 
 int ocelot_probe_port(struct ocelot *ocelot, u8 port,
 		      void __iomem *regs,
diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index 199ca2d6ea32..325afea3e846 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -18,6 +18,7 @@
 #include <linux/ptp_clock_kernel.h>
 #include <linux/regmap.h>
 
+#include <soc/mscc/ocelot.h>
 #include "ocelot_ana.h"
 #include "ocelot_dev.h"
 #include "ocelot_qsys.h"
@@ -52,376 +53,6 @@ struct frame_info {
 	u32 timestamp;	/* rew_val */
 };
 
-#define IFH_INJ_BYPASS	BIT(31)
-#define IFH_INJ_POP_CNT_DISABLE (3 << 28)
-
-#define IFH_TAG_TYPE_C 0
-#define IFH_TAG_TYPE_S 1
-
-#define IFH_REW_OP_NOOP			0x0
-#define IFH_REW_OP_DSCP			0x1
-#define IFH_REW_OP_ONE_STEP_PTP		0x2
-#define IFH_REW_OP_TWO_STEP_PTP		0x3
-#define IFH_REW_OP_ORIGIN_PTP		0x5
-
-#define OCELOT_TAG_LEN			16
-#define OCELOT_SHORT_PREFIX_LEN		4
-#define OCELOT_LONG_PREFIX_LEN		16
-
-#define OCELOT_SPEED_2500 0
-#define OCELOT_SPEED_1000 1
-#define OCELOT_SPEED_100  2
-#define OCELOT_SPEED_10   3
-
-#define TARGET_OFFSET 24
-#define REG_MASK GENMASK(TARGET_OFFSET - 1, 0)
-#define REG(reg, offset) [reg & REG_MASK] = offset
-
-enum ocelot_target {
-	ANA = 1,
-	QS,
-	QSYS,
-	REW,
-	SYS,
-	S2,
-	HSIO,
-	PTP,
-	TARGET_MAX,
-};
-
-enum ocelot_reg {
-	ANA_ADVLEARN = ANA << TARGET_OFFSET,
-	ANA_VLANMASK,
-	ANA_PORT_B_DOMAIN,
-	ANA_ANAGEFIL,
-	ANA_ANEVENTS,
-	ANA_STORMLIMIT_BURST,
-	ANA_STORMLIMIT_CFG,
-	ANA_ISOLATED_PORTS,
-	ANA_COMMUNITY_PORTS,
-	ANA_AUTOAGE,
-	ANA_MACTOPTIONS,
-	ANA_LEARNDISC,
-	ANA_AGENCTRL,
-	ANA_MIRRORPORTS,
-	ANA_EMIRRORPORTS,
-	ANA_FLOODING,
-	ANA_FLOODING_IPMC,
-	ANA_SFLOW_CFG,
-	ANA_PORT_MODE,
-	ANA_CUT_THRU_CFG,
-	ANA_PGID_PGID,
-	ANA_TABLES_ANMOVED,
-	ANA_TABLES_MACHDATA,
-	ANA_TABLES_MACLDATA,
-	ANA_TABLES_STREAMDATA,
-	ANA_TABLES_MACACCESS,
-	ANA_TABLES_MACTINDX,
-	ANA_TABLES_VLANACCESS,
-	ANA_TABLES_VLANTIDX,
-	ANA_TABLES_ISDXACCESS,
-	ANA_TABLES_ISDXTIDX,
-	ANA_TABLES_ENTRYLIM,
-	ANA_TABLES_PTP_ID_HIGH,
-	ANA_TABLES_PTP_ID_LOW,
-	ANA_TABLES_STREAMACCESS,
-	ANA_TABLES_STREAMTIDX,
-	ANA_TABLES_SEQ_HISTORY,
-	ANA_TABLES_SEQ_MASK,
-	ANA_TABLES_SFID_MASK,
-	ANA_TABLES_SFIDACCESS,
-	ANA_TABLES_SFIDTIDX,
-	ANA_MSTI_STATE,
-	ANA_OAM_UPM_LM_CNT,
-	ANA_SG_ACCESS_CTRL,
-	ANA_SG_CONFIG_REG_1,
-	ANA_SG_CONFIG_REG_2,
-	ANA_SG_CONFIG_REG_3,
-	ANA_SG_CONFIG_REG_4,
-	ANA_SG_CONFIG_REG_5,
-	ANA_SG_GCL_GS_CONFIG,
-	ANA_SG_GCL_TI_CONFIG,
-	ANA_SG_STATUS_REG_1,
-	ANA_SG_STATUS_REG_2,
-	ANA_SG_STATUS_REG_3,
-	ANA_PORT_VLAN_CFG,
-	ANA_PORT_DROP_CFG,
-	ANA_PORT_QOS_CFG,
-	ANA_PORT_VCAP_CFG,
-	ANA_PORT_VCAP_S1_KEY_CFG,
-	ANA_PORT_VCAP_S2_CFG,
-	ANA_PORT_PCP_DEI_MAP,
-	ANA_PORT_CPU_FWD_CFG,
-	ANA_PORT_CPU_FWD_BPDU_CFG,
-	ANA_PORT_CPU_FWD_GARP_CFG,
-	ANA_PORT_CPU_FWD_CCM_CFG,
-	ANA_PORT_PORT_CFG,
-	ANA_PORT_POL_CFG,
-	ANA_PORT_PTP_CFG,
-	ANA_PORT_PTP_DLY1_CFG,
-	ANA_PORT_PTP_DLY2_CFG,
-	ANA_PORT_SFID_CFG,
-	ANA_PFC_PFC_CFG,
-	ANA_PFC_PFC_TIMER,
-	ANA_IPT_OAM_MEP_CFG,
-	ANA_IPT_IPT,
-	ANA_PPT_PPT,
-	ANA_FID_MAP_FID_MAP,
-	ANA_AGGR_CFG,
-	ANA_CPUQ_CFG,
-	ANA_CPUQ_CFG2,
-	ANA_CPUQ_8021_CFG,
-	ANA_DSCP_CFG,
-	ANA_DSCP_REWR_CFG,
-	ANA_VCAP_RNG_TYPE_CFG,
-	ANA_VCAP_RNG_VAL_CFG,
-	ANA_VRAP_CFG,
-	ANA_VRAP_HDR_DATA,
-	ANA_VRAP_HDR_MASK,
-	ANA_DISCARD_CFG,
-	ANA_FID_CFG,
-	ANA_POL_PIR_CFG,
-	ANA_POL_CIR_CFG,
-	ANA_POL_MODE_CFG,
-	ANA_POL_PIR_STATE,
-	ANA_POL_CIR_STATE,
-	ANA_POL_STATE,
-	ANA_POL_FLOWC,
-	ANA_POL_HYST,
-	ANA_POL_MISC_CFG,
-	QS_XTR_GRP_CFG = QS << TARGET_OFFSET,
-	QS_XTR_RD,
-	QS_XTR_FRM_PRUNING,
-	QS_XTR_FLUSH,
-	QS_XTR_DATA_PRESENT,
-	QS_XTR_CFG,
-	QS_INJ_GRP_CFG,
-	QS_INJ_WR,
-	QS_INJ_CTRL,
-	QS_INJ_STATUS,
-	QS_INJ_ERR,
-	QS_INH_DBG,
-	QSYS_PORT_MODE = QSYS << TARGET_OFFSET,
-	QSYS_SWITCH_PORT_MODE,
-	QSYS_STAT_CNT_CFG,
-	QSYS_EEE_CFG,
-	QSYS_EEE_THRES,
-	QSYS_IGR_NO_SHARING,
-	QSYS_EGR_NO_SHARING,
-	QSYS_SW_STATUS,
-	QSYS_EXT_CPU_CFG,
-	QSYS_PAD_CFG,
-	QSYS_CPU_GROUP_MAP,
-	QSYS_QMAP,
-	QSYS_ISDX_SGRP,
-	QSYS_TIMED_FRAME_ENTRY,
-	QSYS_TFRM_MISC,
-	QSYS_TFRM_PORT_DLY,
-	QSYS_TFRM_TIMER_CFG_1,
-	QSYS_TFRM_TIMER_CFG_2,
-	QSYS_TFRM_TIMER_CFG_3,
-	QSYS_TFRM_TIMER_CFG_4,
-	QSYS_TFRM_TIMER_CFG_5,
-	QSYS_TFRM_TIMER_CFG_6,
-	QSYS_TFRM_TIMER_CFG_7,
-	QSYS_TFRM_TIMER_CFG_8,
-	QSYS_RED_PROFILE,
-	QSYS_RES_QOS_MODE,
-	QSYS_RES_CFG,
-	QSYS_RES_STAT,
-	QSYS_EGR_DROP_MODE,
-	QSYS_EQ_CTRL,
-	QSYS_EVENTS_CORE,
-	QSYS_QMAXSDU_CFG_0,
-	QSYS_QMAXSDU_CFG_1,
-	QSYS_QMAXSDU_CFG_2,
-	QSYS_QMAXSDU_CFG_3,
-	QSYS_QMAXSDU_CFG_4,
-	QSYS_QMAXSDU_CFG_5,
-	QSYS_QMAXSDU_CFG_6,
-	QSYS_QMAXSDU_CFG_7,
-	QSYS_PREEMPTION_CFG,
-	QSYS_CIR_CFG,
-	QSYS_EIR_CFG,
-	QSYS_SE_CFG,
-	QSYS_SE_DWRR_CFG,
-	QSYS_SE_CONNECT,
-	QSYS_SE_DLB_SENSE,
-	QSYS_CIR_STATE,
-	QSYS_EIR_STATE,
-	QSYS_SE_STATE,
-	QSYS_HSCH_MISC_CFG,
-	QSYS_TAG_CONFIG,
-	QSYS_TAS_PARAM_CFG_CTRL,
-	QSYS_PORT_MAX_SDU,
-	QSYS_PARAM_CFG_REG_1,
-	QSYS_PARAM_CFG_REG_2,
-	QSYS_PARAM_CFG_REG_3,
-	QSYS_PARAM_CFG_REG_4,
-	QSYS_PARAM_CFG_REG_5,
-	QSYS_GCL_CFG_REG_1,
-	QSYS_GCL_CFG_REG_2,
-	QSYS_PARAM_STATUS_REG_1,
-	QSYS_PARAM_STATUS_REG_2,
-	QSYS_PARAM_STATUS_REG_3,
-	QSYS_PARAM_STATUS_REG_4,
-	QSYS_PARAM_STATUS_REG_5,
-	QSYS_PARAM_STATUS_REG_6,
-	QSYS_PARAM_STATUS_REG_7,
-	QSYS_PARAM_STATUS_REG_8,
-	QSYS_PARAM_STATUS_REG_9,
-	QSYS_GCL_STATUS_REG_1,
-	QSYS_GCL_STATUS_REG_2,
-	REW_PORT_VLAN_CFG = REW << TARGET_OFFSET,
-	REW_TAG_CFG,
-	REW_PORT_CFG,
-	REW_DSCP_CFG,
-	REW_PCP_DEI_QOS_MAP_CFG,
-	REW_PTP_CFG,
-	REW_PTP_DLY1_CFG,
-	REW_RED_TAG_CFG,
-	REW_DSCP_REMAP_DP1_CFG,
-	REW_DSCP_REMAP_CFG,
-	REW_STAT_CFG,
-	REW_REW_STICKY,
-	REW_PPT,
-	SYS_COUNT_RX_OCTETS = SYS << TARGET_OFFSET,
-	SYS_COUNT_RX_UNICAST,
-	SYS_COUNT_RX_MULTICAST,
-	SYS_COUNT_RX_BROADCAST,
-	SYS_COUNT_RX_SHORTS,
-	SYS_COUNT_RX_FRAGMENTS,
-	SYS_COUNT_RX_JABBERS,
-	SYS_COUNT_RX_CRC_ALIGN_ERRS,
-	SYS_COUNT_RX_SYM_ERRS,
-	SYS_COUNT_RX_64,
-	SYS_COUNT_RX_65_127,
-	SYS_COUNT_RX_128_255,
-	SYS_COUNT_RX_256_1023,
-	SYS_COUNT_RX_1024_1526,
-	SYS_COUNT_RX_1527_MAX,
-	SYS_COUNT_RX_PAUSE,
-	SYS_COUNT_RX_CONTROL,
-	SYS_COUNT_RX_LONGS,
-	SYS_COUNT_RX_CLASSIFIED_DROPS,
-	SYS_COUNT_TX_OCTETS,
-	SYS_COUNT_TX_UNICAST,
-	SYS_COUNT_TX_MULTICAST,
-	SYS_COUNT_TX_BROADCAST,
-	SYS_COUNT_TX_COLLISION,
-	SYS_COUNT_TX_DROPS,
-	SYS_COUNT_TX_PAUSE,
-	SYS_COUNT_TX_64,
-	SYS_COUNT_TX_65_127,
-	SYS_COUNT_TX_128_511,
-	SYS_COUNT_TX_512_1023,
-	SYS_COUNT_TX_1024_1526,
-	SYS_COUNT_TX_1527_MAX,
-	SYS_COUNT_TX_AGING,
-	SYS_RESET_CFG,
-	SYS_SR_ETYPE_CFG,
-	SYS_VLAN_ETYPE_CFG,
-	SYS_PORT_MODE,
-	SYS_FRONT_PORT_MODE,
-	SYS_FRM_AGING,
-	SYS_STAT_CFG,
-	SYS_SW_STATUS,
-	SYS_MISC_CFG,
-	SYS_REW_MAC_HIGH_CFG,
-	SYS_REW_MAC_LOW_CFG,
-	SYS_TIMESTAMP_OFFSET,
-	SYS_CMID,
-	SYS_PAUSE_CFG,
-	SYS_PAUSE_TOT_CFG,
-	SYS_ATOP,
-	SYS_ATOP_TOT_CFG,
-	SYS_MAC_FC_CFG,
-	SYS_MMGT,
-	SYS_MMGT_FAST,
-	SYS_EVENTS_DIF,
-	SYS_EVENTS_CORE,
-	SYS_CNT,
-	SYS_PTP_STATUS,
-	SYS_PTP_TXSTAMP,
-	SYS_PTP_NXT,
-	SYS_PTP_CFG,
-	SYS_RAM_INIT,
-	SYS_CM_ADDR,
-	SYS_CM_DATA_WR,
-	SYS_CM_DATA_RD,
-	SYS_CM_OP,
-	SYS_CM_DATA,
-	S2_CORE_UPDATE_CTRL = S2 << TARGET_OFFSET,
-	S2_CORE_MV_CFG,
-	S2_CACHE_ENTRY_DAT,
-	S2_CACHE_MASK_DAT,
-	S2_CACHE_ACTION_DAT,
-	S2_CACHE_CNT_DAT,
-	S2_CACHE_TG_DAT,
-	PTP_PIN_CFG = PTP << TARGET_OFFSET,
-	PTP_PIN_TOD_SEC_MSB,
-	PTP_PIN_TOD_SEC_LSB,
-	PTP_PIN_TOD_NSEC,
-	PTP_CFG_MISC,
-	PTP_CLK_CFG_ADJ_CFG,
-	PTP_CLK_CFG_ADJ_FREQ,
-};
-
-enum ocelot_regfield {
-	ANA_ADVLEARN_VLAN_CHK,
-	ANA_ADVLEARN_LEARN_MIRROR,
-	ANA_ANEVENTS_FLOOD_DISCARD,
-	ANA_ANEVENTS_MSTI_DROP,
-	ANA_ANEVENTS_ACLKILL,
-	ANA_ANEVENTS_ACLUSED,
-	ANA_ANEVENTS_AUTOAGE,
-	ANA_ANEVENTS_VS2TTL1,
-	ANA_ANEVENTS_STORM_DROP,
-	ANA_ANEVENTS_LEARN_DROP,
-	ANA_ANEVENTS_AGED_ENTRY,
-	ANA_ANEVENTS_CPU_LEARN_FAILED,
-	ANA_ANEVENTS_AUTO_LEARN_FAILED,
-	ANA_ANEVENTS_LEARN_REMOVE,
-	ANA_ANEVENTS_AUTO_LEARNED,
-	ANA_ANEVENTS_AUTO_MOVED,
-	ANA_ANEVENTS_DROPPED,
-	ANA_ANEVENTS_CLASSIFIED_DROP,
-	ANA_ANEVENTS_CLASSIFIED_COPY,
-	ANA_ANEVENTS_VLAN_DISCARD,
-	ANA_ANEVENTS_FWD_DISCARD,
-	ANA_ANEVENTS_MULTICAST_FLOOD,
-	ANA_ANEVENTS_UNICAST_FLOOD,
-	ANA_ANEVENTS_DEST_KNOWN,
-	ANA_ANEVENTS_BUCKET3_MATCH,
-	ANA_ANEVENTS_BUCKET2_MATCH,
-	ANA_ANEVENTS_BUCKET1_MATCH,
-	ANA_ANEVENTS_BUCKET0_MATCH,
-	ANA_ANEVENTS_CPU_OPERATION,
-	ANA_ANEVENTS_DMAC_LOOKUP,
-	ANA_ANEVENTS_SMAC_LOOKUP,
-	ANA_ANEVENTS_SEQ_GEN_ERR_0,
-	ANA_ANEVENTS_SEQ_GEN_ERR_1,
-	ANA_TABLES_MACACCESS_B_DOM,
-	ANA_TABLES_MACTINDX_BUCKET,
-	ANA_TABLES_MACTINDX_M_INDEX,
-	QSYS_TIMED_FRAME_ENTRY_TFRM_VLD,
-	QSYS_TIMED_FRAME_ENTRY_TFRM_FP,
-	QSYS_TIMED_FRAME_ENTRY_TFRM_PORTNO,
-	QSYS_TIMED_FRAME_ENTRY_TFRM_TM_SEL,
-	QSYS_TIMED_FRAME_ENTRY_TFRM_TM_T,
-	SYS_RESET_CFG_CORE_ENA,
-	SYS_RESET_CFG_MEM_ENA,
-	SYS_RESET_CFG_MEM_INIT,
-	REGFIELD_MAX
-};
-
-enum ocelot_clk_pins {
-	ALT_PPS_PIN	= 1,
-	EXT_CLK_PIN,
-	ALT_LDST_PIN,
-	TOD_ACC_PIN
-};
-
 struct ocelot_multicast {
 	struct list_head list;
 	unsigned char addr[ETH_ALEN];
@@ -429,88 +60,6 @@ struct ocelot_multicast {
 	u16 ports;
 };
 
-enum ocelot_tag_prefix {
-	OCELOT_TAG_PREFIX_DISABLED	= 0,
-	OCELOT_TAG_PREFIX_NONE,
-	OCELOT_TAG_PREFIX_SHORT,
-	OCELOT_TAG_PREFIX_LONG,
-};
-
-struct ocelot_port;
-struct ocelot;
-
-struct ocelot_stat_layout {
-	u32 offset;
-	char name[ETH_GSTRING_LEN];
-};
-
-struct ocelot_ops {
-	void (*pcs_init)(struct ocelot *ocelot, int port);
-	int (*reset)(struct ocelot *ocelot);
-};
-
-struct ocelot {
-	const struct ocelot_ops *ops;
-	struct device *dev;
-
-	struct regmap *targets[TARGET_MAX];
-	struct regmap_field *regfields[REGFIELD_MAX];
-	const u32 *const *map;
-	const struct ocelot_stat_layout *stats_layout;
-	unsigned int num_stats;
-
-	u8 base_mac[ETH_ALEN];
-
-	struct net_device *hw_bridge_dev;
-	u16 bridge_mask;
-	u16 bridge_fwd_mask;
-
-	struct workqueue_struct *ocelot_owq;
-
-	int shared_queue_sz;
-
-	u8 num_phys_ports;
-	u8 num_cpu_ports;
-	u8 cpu;
-	struct ocelot_port **ports;
-
-	u32 *lags;
-
-	/* Keep track of the vlan port masks */
-	u32 vlan_mask[VLAN_N_VID];
-
-	struct list_head multicast;
-
-	/* Workqueue to check statistics for overflow with its lock */
-	struct mutex stats_lock;
-	u64 *stats;
-	struct delayed_work stats_work;
-	struct workqueue_struct *stats_queue;
-
-	u8 ptp:1;
-	struct ptp_clock *ptp_clock;
-	struct ptp_clock_info ptp_info;
-	struct hwtstamp_config hwtstamp_config;
-	struct mutex ptp_lock; /* Protects the PTP interface state */
-	spinlock_t ptp_clock_lock; /* Protects the PTP clock */
-};
-
-struct ocelot_port {
-	struct ocelot *ocelot;
-
-	void __iomem *regs;
-
-	/* Ingress default VLAN (pvid) */
-	u16 pvid;
-
-	/* Egress default VLAN (vid) */
-	u16 vid;
-
-	u8 ptp_cmd;
-	struct list_head skbs;
-	u8 ts_id;
-};
-
 struct ocelot_port_private {
 	struct ocelot_port port;
 	struct net_device *dev;
@@ -531,37 +80,12 @@ struct ocelot_skb {
 	u8 id;
 };
 
-u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset);
-#define ocelot_read_ix(ocelot, reg, gi, ri) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri))
-#define ocelot_read_gix(ocelot, reg, gi) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi))
-#define ocelot_read_rix(ocelot, reg, ri) __ocelot_read_ix(ocelot, reg, reg##_RSZ * (ri))
-#define ocelot_read(ocelot, reg) __ocelot_read_ix(ocelot, reg, 0)
-
-void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset);
-#define ocelot_write_ix(ocelot, val, reg, gi, ri) __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri))
-#define ocelot_write_gix(ocelot, val, reg, gi) __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi))
-#define ocelot_write_rix(ocelot, val, reg, ri) __ocelot_write_ix(ocelot, val, reg, reg##_RSZ * (ri))
-#define ocelot_write(ocelot, val, reg) __ocelot_write_ix(ocelot, val, reg, 0)
-
-void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg,
-		     u32 offset);
-#define ocelot_rmw_ix(ocelot, val, m, reg, gi, ri) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri))
-#define ocelot_rmw_gix(ocelot, val, m, reg, gi) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi))
-#define ocelot_rmw_rix(ocelot, val, m, reg, ri) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_RSZ * (ri))
-#define ocelot_rmw(ocelot, val, m, reg) __ocelot_rmw_ix(ocelot, val, m, reg, 0)
-
 u32 ocelot_port_readl(struct ocelot_port *port, u32 reg);
 void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg);
 
-int ocelot_regfields_init(struct ocelot *ocelot,
-			  const struct reg_field *const regfields);
-struct regmap *ocelot_regmap_init(struct ocelot *ocelot, struct resource *res);
-
 #define ocelot_field_write(ocelot, reg, val) regmap_field_write((ocelot)->regfields[(reg)], (val))
 #define ocelot_field_read(ocelot, reg, val) regmap_field_read((ocelot)->regfields[(reg)], (val))
 
-int ocelot_init(struct ocelot *ocelot);
-void ocelot_deinit(struct ocelot *ocelot);
 int ocelot_chip_init(struct ocelot *ocelot, const struct ocelot_ops *ops);
 int ocelot_probe_port(struct ocelot *ocelot, u8 port,
 		      void __iomem *regs,
@@ -575,7 +99,7 @@ extern struct notifier_block ocelot_netdevice_nb;
 extern struct notifier_block ocelot_switchdev_nb;
 extern struct notifier_block ocelot_switchdev_blocking_nb;
 
-int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts);
-void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts);
+#define ocelot_field_write(ocelot, reg, val) regmap_field_write((ocelot)->regfields[(reg)], (val))
+#define ocelot_field_read(ocelot, reg, val) regmap_field_read((ocelot)->regfields[(reg)], (val))
 
 #endif
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
new file mode 100644
index 000000000000..a836afe8f68e
--- /dev/null
+++ b/include/soc/mscc/ocelot.h
@@ -0,0 +1,539 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/* Copyright (c) 2017 Microsemi Corporation
+ */
+
+#ifndef _SOC_MSCC_OCELOT_H
+#define _SOC_MSCC_OCELOT_H
+
+#include <linux/ptp_clock_kernel.h>
+#include <linux/net_tstamp.h>
+#include <linux/if_vlan.h>
+#include <linux/regmap.h>
+#include <net/dsa.h>
+
+#define IFH_INJ_BYPASS			BIT(31)
+#define IFH_INJ_POP_CNT_DISABLE		(3 << 28)
+
+#define IFH_TAG_TYPE_C			0
+#define IFH_TAG_TYPE_S			1
+
+#define IFH_REW_OP_NOOP			0x0
+#define IFH_REW_OP_DSCP			0x1
+#define IFH_REW_OP_ONE_STEP_PTP		0x2
+#define IFH_REW_OP_TWO_STEP_PTP		0x3
+#define IFH_REW_OP_ORIGIN_PTP		0x5
+
+#define OCELOT_TAG_LEN			16
+#define OCELOT_SHORT_PREFIX_LEN		4
+#define OCELOT_LONG_PREFIX_LEN		16
+
+#define OCELOT_SPEED_2500		0
+#define OCELOT_SPEED_1000		1
+#define OCELOT_SPEED_100		2
+#define OCELOT_SPEED_10			3
+
+#define TARGET_OFFSET			24
+#define REG_MASK			GENMASK(TARGET_OFFSET - 1, 0)
+#define REG(reg, offset)		[reg & REG_MASK] = offset
+
+#define REG_RESERVED_ADDR		0xffffffff
+#define REG_RESERVED(reg)		REG(reg, REG_RESERVED_ADDR)
+
+enum ocelot_target {
+	ANA = 1,
+	QS,
+	QSYS,
+	REW,
+	SYS,
+	S2,
+	HSIO,
+	PTP,
+	GCB,
+	TARGET_MAX,
+};
+
+enum ocelot_reg {
+	ANA_ADVLEARN = ANA << TARGET_OFFSET,
+	ANA_VLANMASK,
+	ANA_PORT_B_DOMAIN,
+	ANA_ANAGEFIL,
+	ANA_ANEVENTS,
+	ANA_STORMLIMIT_BURST,
+	ANA_STORMLIMIT_CFG,
+	ANA_ISOLATED_PORTS,
+	ANA_COMMUNITY_PORTS,
+	ANA_AUTOAGE,
+	ANA_MACTOPTIONS,
+	ANA_LEARNDISC,
+	ANA_AGENCTRL,
+	ANA_MIRRORPORTS,
+	ANA_EMIRRORPORTS,
+	ANA_FLOODING,
+	ANA_FLOODING_IPMC,
+	ANA_SFLOW_CFG,
+	ANA_PORT_MODE,
+	ANA_CUT_THRU_CFG,
+	ANA_PGID_PGID,
+	ANA_TABLES_ANMOVED,
+	ANA_TABLES_MACHDATA,
+	ANA_TABLES_MACLDATA,
+	ANA_TABLES_STREAMDATA,
+	ANA_TABLES_MACACCESS,
+	ANA_TABLES_MACTINDX,
+	ANA_TABLES_VLANACCESS,
+	ANA_TABLES_VLANTIDX,
+	ANA_TABLES_ISDXACCESS,
+	ANA_TABLES_ISDXTIDX,
+	ANA_TABLES_ENTRYLIM,
+	ANA_TABLES_PTP_ID_HIGH,
+	ANA_TABLES_PTP_ID_LOW,
+	ANA_TABLES_STREAMACCESS,
+	ANA_TABLES_STREAMTIDX,
+	ANA_TABLES_SEQ_HISTORY,
+	ANA_TABLES_SEQ_MASK,
+	ANA_TABLES_SFID_MASK,
+	ANA_TABLES_SFIDACCESS,
+	ANA_TABLES_SFIDTIDX,
+	ANA_MSTI_STATE,
+	ANA_OAM_UPM_LM_CNT,
+	ANA_SG_ACCESS_CTRL,
+	ANA_SG_CONFIG_REG_1,
+	ANA_SG_CONFIG_REG_2,
+	ANA_SG_CONFIG_REG_3,
+	ANA_SG_CONFIG_REG_4,
+	ANA_SG_CONFIG_REG_5,
+	ANA_SG_GCL_GS_CONFIG,
+	ANA_SG_GCL_TI_CONFIG,
+	ANA_SG_STATUS_REG_1,
+	ANA_SG_STATUS_REG_2,
+	ANA_SG_STATUS_REG_3,
+	ANA_PORT_VLAN_CFG,
+	ANA_PORT_DROP_CFG,
+	ANA_PORT_QOS_CFG,
+	ANA_PORT_VCAP_CFG,
+	ANA_PORT_VCAP_S1_KEY_CFG,
+	ANA_PORT_VCAP_S2_CFG,
+	ANA_PORT_PCP_DEI_MAP,
+	ANA_PORT_CPU_FWD_CFG,
+	ANA_PORT_CPU_FWD_BPDU_CFG,
+	ANA_PORT_CPU_FWD_GARP_CFG,
+	ANA_PORT_CPU_FWD_CCM_CFG,
+	ANA_PORT_PORT_CFG,
+	ANA_PORT_POL_CFG,
+	ANA_PORT_PTP_CFG,
+	ANA_PORT_PTP_DLY1_CFG,
+	ANA_PORT_PTP_DLY2_CFG,
+	ANA_PORT_SFID_CFG,
+	ANA_PFC_PFC_CFG,
+	ANA_PFC_PFC_TIMER,
+	ANA_IPT_OAM_MEP_CFG,
+	ANA_IPT_IPT,
+	ANA_PPT_PPT,
+	ANA_FID_MAP_FID_MAP,
+	ANA_AGGR_CFG,
+	ANA_CPUQ_CFG,
+	ANA_CPUQ_CFG2,
+	ANA_CPUQ_8021_CFG,
+	ANA_DSCP_CFG,
+	ANA_DSCP_REWR_CFG,
+	ANA_VCAP_RNG_TYPE_CFG,
+	ANA_VCAP_RNG_VAL_CFG,
+	ANA_VRAP_CFG,
+	ANA_VRAP_HDR_DATA,
+	ANA_VRAP_HDR_MASK,
+	ANA_DISCARD_CFG,
+	ANA_FID_CFG,
+	ANA_POL_PIR_CFG,
+	ANA_POL_CIR_CFG,
+	ANA_POL_MODE_CFG,
+	ANA_POL_PIR_STATE,
+	ANA_POL_CIR_STATE,
+	ANA_POL_STATE,
+	ANA_POL_FLOWC,
+	ANA_POL_HYST,
+	ANA_POL_MISC_CFG,
+	QS_XTR_GRP_CFG = QS << TARGET_OFFSET,
+	QS_XTR_RD,
+	QS_XTR_FRM_PRUNING,
+	QS_XTR_FLUSH,
+	QS_XTR_DATA_PRESENT,
+	QS_XTR_CFG,
+	QS_INJ_GRP_CFG,
+	QS_INJ_WR,
+	QS_INJ_CTRL,
+	QS_INJ_STATUS,
+	QS_INJ_ERR,
+	QS_INH_DBG,
+	QSYS_PORT_MODE = QSYS << TARGET_OFFSET,
+	QSYS_SWITCH_PORT_MODE,
+	QSYS_STAT_CNT_CFG,
+	QSYS_EEE_CFG,
+	QSYS_EEE_THRES,
+	QSYS_IGR_NO_SHARING,
+	QSYS_EGR_NO_SHARING,
+	QSYS_SW_STATUS,
+	QSYS_EXT_CPU_CFG,
+	QSYS_PAD_CFG,
+	QSYS_CPU_GROUP_MAP,
+	QSYS_QMAP,
+	QSYS_ISDX_SGRP,
+	QSYS_TIMED_FRAME_ENTRY,
+	QSYS_TFRM_MISC,
+	QSYS_TFRM_PORT_DLY,
+	QSYS_TFRM_TIMER_CFG_1,
+	QSYS_TFRM_TIMER_CFG_2,
+	QSYS_TFRM_TIMER_CFG_3,
+	QSYS_TFRM_TIMER_CFG_4,
+	QSYS_TFRM_TIMER_CFG_5,
+	QSYS_TFRM_TIMER_CFG_6,
+	QSYS_TFRM_TIMER_CFG_7,
+	QSYS_TFRM_TIMER_CFG_8,
+	QSYS_RED_PROFILE,
+	QSYS_RES_QOS_MODE,
+	QSYS_RES_CFG,
+	QSYS_RES_STAT,
+	QSYS_EGR_DROP_MODE,
+	QSYS_EQ_CTRL,
+	QSYS_EVENTS_CORE,
+	QSYS_QMAXSDU_CFG_0,
+	QSYS_QMAXSDU_CFG_1,
+	QSYS_QMAXSDU_CFG_2,
+	QSYS_QMAXSDU_CFG_3,
+	QSYS_QMAXSDU_CFG_4,
+	QSYS_QMAXSDU_CFG_5,
+	QSYS_QMAXSDU_CFG_6,
+	QSYS_QMAXSDU_CFG_7,
+	QSYS_PREEMPTION_CFG,
+	QSYS_CIR_CFG,
+	QSYS_EIR_CFG,
+	QSYS_SE_CFG,
+	QSYS_SE_DWRR_CFG,
+	QSYS_SE_CONNECT,
+	QSYS_SE_DLB_SENSE,
+	QSYS_CIR_STATE,
+	QSYS_EIR_STATE,
+	QSYS_SE_STATE,
+	QSYS_HSCH_MISC_CFG,
+	QSYS_TAG_CONFIG,
+	QSYS_TAS_PARAM_CFG_CTRL,
+	QSYS_PORT_MAX_SDU,
+	QSYS_PARAM_CFG_REG_1,
+	QSYS_PARAM_CFG_REG_2,
+	QSYS_PARAM_CFG_REG_3,
+	QSYS_PARAM_CFG_REG_4,
+	QSYS_PARAM_CFG_REG_5,
+	QSYS_GCL_CFG_REG_1,
+	QSYS_GCL_CFG_REG_2,
+	QSYS_PARAM_STATUS_REG_1,
+	QSYS_PARAM_STATUS_REG_2,
+	QSYS_PARAM_STATUS_REG_3,
+	QSYS_PARAM_STATUS_REG_4,
+	QSYS_PARAM_STATUS_REG_5,
+	QSYS_PARAM_STATUS_REG_6,
+	QSYS_PARAM_STATUS_REG_7,
+	QSYS_PARAM_STATUS_REG_8,
+	QSYS_PARAM_STATUS_REG_9,
+	QSYS_GCL_STATUS_REG_1,
+	QSYS_GCL_STATUS_REG_2,
+	REW_PORT_VLAN_CFG = REW << TARGET_OFFSET,
+	REW_TAG_CFG,
+	REW_PORT_CFG,
+	REW_DSCP_CFG,
+	REW_PCP_DEI_QOS_MAP_CFG,
+	REW_PTP_CFG,
+	REW_PTP_DLY1_CFG,
+	REW_RED_TAG_CFG,
+	REW_DSCP_REMAP_DP1_CFG,
+	REW_DSCP_REMAP_CFG,
+	REW_STAT_CFG,
+	REW_REW_STICKY,
+	REW_PPT,
+	SYS_COUNT_RX_OCTETS = SYS << TARGET_OFFSET,
+	SYS_COUNT_RX_UNICAST,
+	SYS_COUNT_RX_MULTICAST,
+	SYS_COUNT_RX_BROADCAST,
+	SYS_COUNT_RX_SHORTS,
+	SYS_COUNT_RX_FRAGMENTS,
+	SYS_COUNT_RX_JABBERS,
+	SYS_COUNT_RX_CRC_ALIGN_ERRS,
+	SYS_COUNT_RX_SYM_ERRS,
+	SYS_COUNT_RX_64,
+	SYS_COUNT_RX_65_127,
+	SYS_COUNT_RX_128_255,
+	SYS_COUNT_RX_256_1023,
+	SYS_COUNT_RX_1024_1526,
+	SYS_COUNT_RX_1527_MAX,
+	SYS_COUNT_RX_PAUSE,
+	SYS_COUNT_RX_CONTROL,
+	SYS_COUNT_RX_LONGS,
+	SYS_COUNT_RX_CLASSIFIED_DROPS,
+	SYS_COUNT_TX_OCTETS,
+	SYS_COUNT_TX_UNICAST,
+	SYS_COUNT_TX_MULTICAST,
+	SYS_COUNT_TX_BROADCAST,
+	SYS_COUNT_TX_COLLISION,
+	SYS_COUNT_TX_DROPS,
+	SYS_COUNT_TX_PAUSE,
+	SYS_COUNT_TX_64,
+	SYS_COUNT_TX_65_127,
+	SYS_COUNT_TX_128_511,
+	SYS_COUNT_TX_512_1023,
+	SYS_COUNT_TX_1024_1526,
+	SYS_COUNT_TX_1527_MAX,
+	SYS_COUNT_TX_AGING,
+	SYS_RESET_CFG,
+	SYS_SR_ETYPE_CFG,
+	SYS_VLAN_ETYPE_CFG,
+	SYS_PORT_MODE,
+	SYS_FRONT_PORT_MODE,
+	SYS_FRM_AGING,
+	SYS_STAT_CFG,
+	SYS_SW_STATUS,
+	SYS_MISC_CFG,
+	SYS_REW_MAC_HIGH_CFG,
+	SYS_REW_MAC_LOW_CFG,
+	SYS_TIMESTAMP_OFFSET,
+	SYS_CMID,
+	SYS_PAUSE_CFG,
+	SYS_PAUSE_TOT_CFG,
+	SYS_ATOP,
+	SYS_ATOP_TOT_CFG,
+	SYS_MAC_FC_CFG,
+	SYS_MMGT,
+	SYS_MMGT_FAST,
+	SYS_EVENTS_DIF,
+	SYS_EVENTS_CORE,
+	SYS_CNT,
+	SYS_PTP_STATUS,
+	SYS_PTP_TXSTAMP,
+	SYS_PTP_NXT,
+	SYS_PTP_CFG,
+	SYS_RAM_INIT,
+	SYS_CM_ADDR,
+	SYS_CM_DATA_WR,
+	SYS_CM_DATA_RD,
+	SYS_CM_OP,
+	SYS_CM_DATA,
+	S2_CORE_UPDATE_CTRL = S2 << TARGET_OFFSET,
+	S2_CORE_MV_CFG,
+	S2_CACHE_ENTRY_DAT,
+	S2_CACHE_MASK_DAT,
+	S2_CACHE_ACTION_DAT,
+	S2_CACHE_CNT_DAT,
+	S2_CACHE_TG_DAT,
+	PTP_PIN_CFG = PTP << TARGET_OFFSET,
+	PTP_PIN_TOD_SEC_MSB,
+	PTP_PIN_TOD_SEC_LSB,
+	PTP_PIN_TOD_NSEC,
+	PTP_CFG_MISC,
+	PTP_CLK_CFG_ADJ_CFG,
+	PTP_CLK_CFG_ADJ_FREQ,
+	GCB_SOFT_RST = GCB << TARGET_OFFSET,
+};
+
+enum ocelot_regfield {
+	ANA_ADVLEARN_VLAN_CHK,
+	ANA_ADVLEARN_LEARN_MIRROR,
+	ANA_ANEVENTS_FLOOD_DISCARD,
+	ANA_ANEVENTS_MSTI_DROP,
+	ANA_ANEVENTS_ACLKILL,
+	ANA_ANEVENTS_ACLUSED,
+	ANA_ANEVENTS_AUTOAGE,
+	ANA_ANEVENTS_VS2TTL1,
+	ANA_ANEVENTS_STORM_DROP,
+	ANA_ANEVENTS_LEARN_DROP,
+	ANA_ANEVENTS_AGED_ENTRY,
+	ANA_ANEVENTS_CPU_LEARN_FAILED,
+	ANA_ANEVENTS_AUTO_LEARN_FAILED,
+	ANA_ANEVENTS_LEARN_REMOVE,
+	ANA_ANEVENTS_AUTO_LEARNED,
+	ANA_ANEVENTS_AUTO_MOVED,
+	ANA_ANEVENTS_DROPPED,
+	ANA_ANEVENTS_CLASSIFIED_DROP,
+	ANA_ANEVENTS_CLASSIFIED_COPY,
+	ANA_ANEVENTS_VLAN_DISCARD,
+	ANA_ANEVENTS_FWD_DISCARD,
+	ANA_ANEVENTS_MULTICAST_FLOOD,
+	ANA_ANEVENTS_UNICAST_FLOOD,
+	ANA_ANEVENTS_DEST_KNOWN,
+	ANA_ANEVENTS_BUCKET3_MATCH,
+	ANA_ANEVENTS_BUCKET2_MATCH,
+	ANA_ANEVENTS_BUCKET1_MATCH,
+	ANA_ANEVENTS_BUCKET0_MATCH,
+	ANA_ANEVENTS_CPU_OPERATION,
+	ANA_ANEVENTS_DMAC_LOOKUP,
+	ANA_ANEVENTS_SMAC_LOOKUP,
+	ANA_ANEVENTS_SEQ_GEN_ERR_0,
+	ANA_ANEVENTS_SEQ_GEN_ERR_1,
+	ANA_TABLES_MACACCESS_B_DOM,
+	ANA_TABLES_MACTINDX_BUCKET,
+	ANA_TABLES_MACTINDX_M_INDEX,
+	QSYS_TIMED_FRAME_ENTRY_TFRM_VLD,
+	QSYS_TIMED_FRAME_ENTRY_TFRM_FP,
+	QSYS_TIMED_FRAME_ENTRY_TFRM_PORTNO,
+	QSYS_TIMED_FRAME_ENTRY_TFRM_TM_SEL,
+	QSYS_TIMED_FRAME_ENTRY_TFRM_TM_T,
+	SYS_RESET_CFG_CORE_ENA,
+	SYS_RESET_CFG_MEM_ENA,
+	SYS_RESET_CFG_MEM_INIT,
+	GCB_SOFT_RST_SWC_RST,
+	REGFIELD_MAX
+};
+
+enum ocelot_clk_pins {
+	ALT_PPS_PIN	= 1,
+	EXT_CLK_PIN,
+	ALT_LDST_PIN,
+	TOD_ACC_PIN
+};
+
+struct ocelot_stat_layout {
+	u32 offset;
+	char name[ETH_GSTRING_LEN];
+};
+
+enum ocelot_tag_prefix {
+	OCELOT_TAG_PREFIX_DISABLED	= 0,
+	OCELOT_TAG_PREFIX_NONE,
+	OCELOT_TAG_PREFIX_SHORT,
+	OCELOT_TAG_PREFIX_LONG,
+};
+
+struct ocelot;
+
+struct ocelot_ops {
+	void (*pcs_init)(struct ocelot *ocelot, int port);
+	int (*reset)(struct ocelot *ocelot);
+};
+
+struct ocelot_port {
+	struct ocelot			*ocelot;
+
+	void __iomem			*regs;
+
+	/* Ingress default VLAN (pvid) */
+	u16				pvid;
+
+	/* Egress default VLAN (vid) */
+	u16				vid;
+
+	u8				ptp_cmd;
+	struct list_head		skbs;
+	u8				ts_id;
+};
+
+struct ocelot {
+	struct device			*dev;
+
+	const struct ocelot_ops		*ops;
+	struct regmap			*targets[TARGET_MAX];
+	struct regmap_field		*regfields[REGFIELD_MAX];
+	const u32 *const		*map;
+	const struct ocelot_stat_layout	*stats_layout;
+	unsigned int			num_stats;
+
+	int				shared_queue_sz;
+
+	struct net_device		*hw_bridge_dev;
+	u16				bridge_mask;
+	u16				bridge_fwd_mask;
+
+	struct ocelot_port		**ports;
+
+	u8				base_mac[ETH_ALEN];
+
+	/* Keep track of the vlan port masks */
+	u32				vlan_mask[VLAN_N_VID];
+
+	u8				num_phys_ports;
+	u8				num_cpu_ports;
+	u8				cpu;
+
+	u32				*lags;
+
+	struct list_head		multicast;
+
+	/* Workqueue to check statistics for overflow with its lock */
+	struct mutex			stats_lock;
+	u64				*stats;
+	struct delayed_work		stats_work;
+	struct workqueue_struct		*stats_queue;
+
+	u8				ptp:1;
+	struct ptp_clock		*ptp_clock;
+	struct ptp_clock_info		ptp_info;
+	struct hwtstamp_config		hwtstamp_config;
+	/* Protects the PTP interface state */
+	struct mutex			ptp_lock;
+	/* Protects the PTP clock */
+	spinlock_t			ptp_clock_lock;
+
+	void (*port_pcs_init)(struct ocelot_port *port);
+};
+
+#define ocelot_read_ix(ocelot, reg, gi, ri) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri))
+#define ocelot_read_gix(ocelot, reg, gi) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi))
+#define ocelot_read_rix(ocelot, reg, ri) __ocelot_read_ix(ocelot, reg, reg##_RSZ * (ri))
+#define ocelot_read(ocelot, reg) __ocelot_read_ix(ocelot, reg, 0)
+
+#define ocelot_write_ix(ocelot, val, reg, gi, ri) __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri))
+#define ocelot_write_gix(ocelot, val, reg, gi) __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi))
+#define ocelot_write_rix(ocelot, val, reg, ri) __ocelot_write_ix(ocelot, val, reg, reg##_RSZ * (ri))
+#define ocelot_write(ocelot, val, reg) __ocelot_write_ix(ocelot, val, reg, 0)
+
+#define ocelot_rmw_ix(ocelot, val, m, reg, gi, ri) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri))
+#define ocelot_rmw_gix(ocelot, val, m, reg, gi) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi))
+#define ocelot_rmw_rix(ocelot, val, m, reg, ri) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_RSZ * (ri))
+#define ocelot_rmw(ocelot, val, m, reg) __ocelot_rmw_ix(ocelot, val, m, reg, 0)
+
+/* I/O */
+u32 ocelot_port_readl(struct ocelot_port *port, u32 reg);
+void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg);
+u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset);
+void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset);
+void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg,
+		     u32 offset);
+
+/* Hardware initialization */
+int ocelot_regfields_init(struct ocelot *ocelot,
+			  const struct reg_field *const regfields);
+struct regmap *ocelot_regmap_init(struct ocelot *ocelot, struct resource *res);
+void ocelot_set_cpu_port(struct ocelot *ocelot, int cpu,
+			 enum ocelot_tag_prefix injection,
+			 enum ocelot_tag_prefix extraction);
+int ocelot_init(struct ocelot *ocelot);
+void ocelot_deinit(struct ocelot *ocelot);
+void ocelot_init_port(struct ocelot *ocelot, int port);
+
+/* DSA callbacks */
+void ocelot_port_enable(struct ocelot *ocelot, int port,
+			struct phy_device *phy);
+void ocelot_port_disable(struct ocelot *ocelot, int port);
+void ocelot_get_strings(struct ocelot *ocelot, int port, u32 sset, u8 *data);
+void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data);
+int ocelot_get_sset_count(struct ocelot *ocelot, int port, int sset);
+int ocelot_get_ts_info(struct ocelot *ocelot, int port,
+		       struct ethtool_ts_info *info);
+void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs);
+void ocelot_adjust_link(struct ocelot *ocelot, int port,
+			struct phy_device *phydev);
+void ocelot_port_vlan_filtering(struct ocelot *ocelot, int port,
+				bool vlan_aware);
+void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state);
+int ocelot_port_bridge_join(struct ocelot *ocelot, int port,
+			    struct net_device *bridge);
+int ocelot_port_bridge_leave(struct ocelot *ocelot, int port,
+			     struct net_device *bridge);
+int ocelot_fdb_dump(struct ocelot *ocelot, int port,
+		    dsa_fdb_dump_cb_t *cb, void *data);
+int ocelot_fdb_add(struct ocelot *ocelot, int port,
+		   const unsigned char *addr, u16 vid, bool vlan_aware);
+int ocelot_fdb_del(struct ocelot *ocelot, int port,
+		   const unsigned char *addr, u16 vid);
+int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid,
+		    bool untagged);
+int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid);
+int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts);
+void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts);
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From a030dfe1947310a2140b9e371dc9ebfab72c914f Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 14 Nov 2019 17:03:28 +0200
Subject: net: mscc: ocelot: publish ocelot_sys.h to include/soc/mscc

The Felix DSA driver needs to write to SYS_RAM_INIT_RAM_INIT for its own
chip initialization process.

Also update the MAINTAINERS file such that the headers exported by the
ocelot driver are under the same maintainers' umbrella as the driver
itself.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                            |   1 +
 drivers/net/ethernet/mscc/ocelot.h     |   2 +-
 drivers/net/ethernet/mscc/ocelot_sys.h | 144 ---------------------------------
 include/soc/mscc/ocelot_sys.h          | 144 +++++++++++++++++++++++++++++++++
 4 files changed, 146 insertions(+), 145 deletions(-)
 delete mode 100644 drivers/net/ethernet/mscc/ocelot_sys.h
 create mode 100644 include/soc/mscc/ocelot_sys.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 3e57fc1d9962..d09a3205da37 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10834,6 +10834,7 @@ M:	Microchip Linux Driver Support <UNGLinuxDriver@microchip.com>
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/mscc/
+F:	include/soc/mscc/ocelot*
 
 MICROSOFT SURFACE PRO 3 BUTTON DRIVER
 M:	Chen Yu <yu.c.chen@intel.com>
diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index 325afea3e846..32fef4f495aa 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -18,12 +18,12 @@
 #include <linux/ptp_clock_kernel.h>
 #include <linux/regmap.h>
 
+#include <soc/mscc/ocelot_sys.h>
 #include <soc/mscc/ocelot.h>
 #include "ocelot_ana.h"
 #include "ocelot_dev.h"
 #include "ocelot_qsys.h"
 #include "ocelot_rew.h"
-#include "ocelot_sys.h"
 #include "ocelot_qs.h"
 #include "ocelot_tc.h"
 #include "ocelot_ptp.h"
diff --git a/drivers/net/ethernet/mscc/ocelot_sys.h b/drivers/net/ethernet/mscc/ocelot_sys.h
deleted file mode 100644
index 16f91e172bcb..000000000000
--- a/drivers/net/ethernet/mscc/ocelot_sys.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
-/*
- * Microsemi Ocelot Switch driver
- *
- * Copyright (c) 2017 Microsemi Corporation
- */
-
-#ifndef _MSCC_OCELOT_SYS_H_
-#define _MSCC_OCELOT_SYS_H_
-
-#define SYS_COUNT_RX_OCTETS_RSZ                           0x4
-
-#define SYS_COUNT_TX_OCTETS_RSZ                           0x4
-
-#define SYS_PORT_MODE_RSZ                                 0x4
-
-#define SYS_PORT_MODE_DATA_WO_TS(x)                       (((x) << 5) & GENMASK(6, 5))
-#define SYS_PORT_MODE_DATA_WO_TS_M                        GENMASK(6, 5)
-#define SYS_PORT_MODE_DATA_WO_TS_X(x)                     (((x) & GENMASK(6, 5)) >> 5)
-#define SYS_PORT_MODE_INCL_INJ_HDR(x)                     (((x) << 3) & GENMASK(4, 3))
-#define SYS_PORT_MODE_INCL_INJ_HDR_M                      GENMASK(4, 3)
-#define SYS_PORT_MODE_INCL_INJ_HDR_X(x)                   (((x) & GENMASK(4, 3)) >> 3)
-#define SYS_PORT_MODE_INCL_XTR_HDR(x)                     (((x) << 1) & GENMASK(2, 1))
-#define SYS_PORT_MODE_INCL_XTR_HDR_M                      GENMASK(2, 1)
-#define SYS_PORT_MODE_INCL_XTR_HDR_X(x)                   (((x) & GENMASK(2, 1)) >> 1)
-#define SYS_PORT_MODE_INJ_HDR_ERR                         BIT(0)
-
-#define SYS_FRONT_PORT_MODE_RSZ                           0x4
-
-#define SYS_FRONT_PORT_MODE_HDX_MODE                      BIT(0)
-
-#define SYS_FRM_AGING_AGE_TX_ENA                          BIT(20)
-#define SYS_FRM_AGING_MAX_AGE(x)                          ((x) & GENMASK(19, 0))
-#define SYS_FRM_AGING_MAX_AGE_M                           GENMASK(19, 0)
-
-#define SYS_STAT_CFG_STAT_CLEAR_SHOT(x)                   (((x) << 10) & GENMASK(16, 10))
-#define SYS_STAT_CFG_STAT_CLEAR_SHOT_M                    GENMASK(16, 10)
-#define SYS_STAT_CFG_STAT_CLEAR_SHOT_X(x)                 (((x) & GENMASK(16, 10)) >> 10)
-#define SYS_STAT_CFG_STAT_VIEW(x)                         ((x) & GENMASK(9, 0))
-#define SYS_STAT_CFG_STAT_VIEW_M                          GENMASK(9, 0)
-
-#define SYS_SW_STATUS_RSZ                                 0x4
-
-#define SYS_SW_STATUS_PORT_RX_PAUSED                      BIT(0)
-
-#define SYS_MISC_CFG_PTP_RSRV_CLR                         BIT(1)
-#define SYS_MISC_CFG_PTP_DIS_NEG_RO                       BIT(0)
-
-#define SYS_REW_MAC_HIGH_CFG_RSZ                          0x4
-
-#define SYS_REW_MAC_LOW_CFG_RSZ                           0x4
-
-#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG(x)              (((x) << 6) & GENMASK(21, 6))
-#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG_M               GENMASK(21, 6)
-#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG_X(x)            (((x) & GENMASK(21, 6)) >> 6)
-#define SYS_TIMESTAMP_OFFSET_TIMESTAMP_OFFSET(x)          ((x) & GENMASK(5, 0))
-#define SYS_TIMESTAMP_OFFSET_TIMESTAMP_OFFSET_M           GENMASK(5, 0)
-
-#define SYS_PAUSE_CFG_RSZ                                 0x4
-
-#define SYS_PAUSE_CFG_PAUSE_START(x)                      (((x) << 10) & GENMASK(18, 10))
-#define SYS_PAUSE_CFG_PAUSE_START_M                       GENMASK(18, 10)
-#define SYS_PAUSE_CFG_PAUSE_START_X(x)                    (((x) & GENMASK(18, 10)) >> 10)
-#define SYS_PAUSE_CFG_PAUSE_STOP(x)                       (((x) << 1) & GENMASK(9, 1))
-#define SYS_PAUSE_CFG_PAUSE_STOP_M                        GENMASK(9, 1)
-#define SYS_PAUSE_CFG_PAUSE_STOP_X(x)                     (((x) & GENMASK(9, 1)) >> 1)
-#define SYS_PAUSE_CFG_PAUSE_ENA                           BIT(0)
-
-#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START(x)              (((x) << 9) & GENMASK(17, 9))
-#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START_M               GENMASK(17, 9)
-#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START_X(x)            (((x) & GENMASK(17, 9)) >> 9)
-#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_STOP(x)               ((x) & GENMASK(8, 0))
-#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_STOP_M                GENMASK(8, 0)
-
-#define SYS_ATOP_RSZ                                      0x4
-
-#define SYS_MAC_FC_CFG_RSZ                                0x4
-
-#define SYS_MAC_FC_CFG_FC_LINK_SPEED(x)                   (((x) << 26) & GENMASK(27, 26))
-#define SYS_MAC_FC_CFG_FC_LINK_SPEED_M                    GENMASK(27, 26)
-#define SYS_MAC_FC_CFG_FC_LINK_SPEED_X(x)                 (((x) & GENMASK(27, 26)) >> 26)
-#define SYS_MAC_FC_CFG_FC_LATENCY_CFG(x)                  (((x) << 20) & GENMASK(25, 20))
-#define SYS_MAC_FC_CFG_FC_LATENCY_CFG_M                   GENMASK(25, 20)
-#define SYS_MAC_FC_CFG_FC_LATENCY_CFG_X(x)                (((x) & GENMASK(25, 20)) >> 20)
-#define SYS_MAC_FC_CFG_ZERO_PAUSE_ENA                     BIT(18)
-#define SYS_MAC_FC_CFG_TX_FC_ENA                          BIT(17)
-#define SYS_MAC_FC_CFG_RX_FC_ENA                          BIT(16)
-#define SYS_MAC_FC_CFG_PAUSE_VAL_CFG(x)                   ((x) & GENMASK(15, 0))
-#define SYS_MAC_FC_CFG_PAUSE_VAL_CFG_M                    GENMASK(15, 0)
-
-#define SYS_MMGT_RELCNT(x)                                (((x) << 16) & GENMASK(31, 16))
-#define SYS_MMGT_RELCNT_M                                 GENMASK(31, 16)
-#define SYS_MMGT_RELCNT_X(x)                              (((x) & GENMASK(31, 16)) >> 16)
-#define SYS_MMGT_FREECNT(x)                               ((x) & GENMASK(15, 0))
-#define SYS_MMGT_FREECNT_M                                GENMASK(15, 0)
-
-#define SYS_MMGT_FAST_FREEVLD(x)                          (((x) << 4) & GENMASK(7, 4))
-#define SYS_MMGT_FAST_FREEVLD_M                           GENMASK(7, 4)
-#define SYS_MMGT_FAST_FREEVLD_X(x)                        (((x) & GENMASK(7, 4)) >> 4)
-#define SYS_MMGT_FAST_RELVLD(x)                           ((x) & GENMASK(3, 0))
-#define SYS_MMGT_FAST_RELVLD_M                            GENMASK(3, 0)
-
-#define SYS_EVENTS_DIF_RSZ                                0x4
-
-#define SYS_EVENTS_DIF_EV_DRX(x)                          (((x) << 6) & GENMASK(8, 6))
-#define SYS_EVENTS_DIF_EV_DRX_M                           GENMASK(8, 6)
-#define SYS_EVENTS_DIF_EV_DRX_X(x)                        (((x) & GENMASK(8, 6)) >> 6)
-#define SYS_EVENTS_DIF_EV_DTX(x)                          ((x) & GENMASK(5, 0))
-#define SYS_EVENTS_DIF_EV_DTX_M                           GENMASK(5, 0)
-
-#define SYS_EVENTS_CORE_EV_FWR                            BIT(2)
-#define SYS_EVENTS_CORE_EV_ANA(x)                         ((x) & GENMASK(1, 0))
-#define SYS_EVENTS_CORE_EV_ANA_M                          GENMASK(1, 0)
-
-#define SYS_CNT_GSZ                                       0x4
-
-#define SYS_PTP_STATUS_PTP_TXSTAMP_OAM                    BIT(29)
-#define SYS_PTP_STATUS_PTP_OVFL                           BIT(28)
-#define SYS_PTP_STATUS_PTP_MESS_VLD                       BIT(27)
-#define SYS_PTP_STATUS_PTP_MESS_ID(x)                     (((x) << 21) & GENMASK(26, 21))
-#define SYS_PTP_STATUS_PTP_MESS_ID_M                      GENMASK(26, 21)
-#define SYS_PTP_STATUS_PTP_MESS_ID_X(x)                   (((x) & GENMASK(26, 21)) >> 21)
-#define SYS_PTP_STATUS_PTP_MESS_TXPORT(x)                 (((x) << 16) & GENMASK(20, 16))
-#define SYS_PTP_STATUS_PTP_MESS_TXPORT_M                  GENMASK(20, 16)
-#define SYS_PTP_STATUS_PTP_MESS_TXPORT_X(x)               (((x) & GENMASK(20, 16)) >> 16)
-#define SYS_PTP_STATUS_PTP_MESS_SEQ_ID(x)                 ((x) & GENMASK(15, 0))
-#define SYS_PTP_STATUS_PTP_MESS_SEQ_ID_M                  GENMASK(15, 0)
-
-#define SYS_PTP_TXSTAMP_PTP_TXSTAMP(x)                    ((x) & GENMASK(29, 0))
-#define SYS_PTP_TXSTAMP_PTP_TXSTAMP_M                     GENMASK(29, 0)
-#define SYS_PTP_TXSTAMP_PTP_TXSTAMP_SEC                   BIT(31)
-
-#define SYS_PTP_NXT_PTP_NXT                               BIT(0)
-
-#define SYS_PTP_CFG_PTP_STAMP_WID(x)                      (((x) << 2) & GENMASK(7, 2))
-#define SYS_PTP_CFG_PTP_STAMP_WID_M                       GENMASK(7, 2)
-#define SYS_PTP_CFG_PTP_STAMP_WID_X(x)                    (((x) & GENMASK(7, 2)) >> 2)
-#define SYS_PTP_CFG_PTP_CF_ROLL_MODE(x)                   ((x) & GENMASK(1, 0))
-#define SYS_PTP_CFG_PTP_CF_ROLL_MODE_M                    GENMASK(1, 0)
-
-#define SYS_RAM_INIT_RAM_INIT                             BIT(1)
-#define SYS_RAM_INIT_RAM_CFG_HOOK                         BIT(0)
-
-#endif
diff --git a/include/soc/mscc/ocelot_sys.h b/include/soc/mscc/ocelot_sys.h
new file mode 100644
index 000000000000..16f91e172bcb
--- /dev/null
+++ b/include/soc/mscc/ocelot_sys.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */
+/*
+ * Microsemi Ocelot Switch driver
+ *
+ * Copyright (c) 2017 Microsemi Corporation
+ */
+
+#ifndef _MSCC_OCELOT_SYS_H_
+#define _MSCC_OCELOT_SYS_H_
+
+#define SYS_COUNT_RX_OCTETS_RSZ                           0x4
+
+#define SYS_COUNT_TX_OCTETS_RSZ                           0x4
+
+#define SYS_PORT_MODE_RSZ                                 0x4
+
+#define SYS_PORT_MODE_DATA_WO_TS(x)                       (((x) << 5) & GENMASK(6, 5))
+#define SYS_PORT_MODE_DATA_WO_TS_M                        GENMASK(6, 5)
+#define SYS_PORT_MODE_DATA_WO_TS_X(x)                     (((x) & GENMASK(6, 5)) >> 5)
+#define SYS_PORT_MODE_INCL_INJ_HDR(x)                     (((x) << 3) & GENMASK(4, 3))
+#define SYS_PORT_MODE_INCL_INJ_HDR_M                      GENMASK(4, 3)
+#define SYS_PORT_MODE_INCL_INJ_HDR_X(x)                   (((x) & GENMASK(4, 3)) >> 3)
+#define SYS_PORT_MODE_INCL_XTR_HDR(x)                     (((x) << 1) & GENMASK(2, 1))
+#define SYS_PORT_MODE_INCL_XTR_HDR_M                      GENMASK(2, 1)
+#define SYS_PORT_MODE_INCL_XTR_HDR_X(x)                   (((x) & GENMASK(2, 1)) >> 1)
+#define SYS_PORT_MODE_INJ_HDR_ERR                         BIT(0)
+
+#define SYS_FRONT_PORT_MODE_RSZ                           0x4
+
+#define SYS_FRONT_PORT_MODE_HDX_MODE                      BIT(0)
+
+#define SYS_FRM_AGING_AGE_TX_ENA                          BIT(20)
+#define SYS_FRM_AGING_MAX_AGE(x)                          ((x) & GENMASK(19, 0))
+#define SYS_FRM_AGING_MAX_AGE_M                           GENMASK(19, 0)
+
+#define SYS_STAT_CFG_STAT_CLEAR_SHOT(x)                   (((x) << 10) & GENMASK(16, 10))
+#define SYS_STAT_CFG_STAT_CLEAR_SHOT_M                    GENMASK(16, 10)
+#define SYS_STAT_CFG_STAT_CLEAR_SHOT_X(x)                 (((x) & GENMASK(16, 10)) >> 10)
+#define SYS_STAT_CFG_STAT_VIEW(x)                         ((x) & GENMASK(9, 0))
+#define SYS_STAT_CFG_STAT_VIEW_M                          GENMASK(9, 0)
+
+#define SYS_SW_STATUS_RSZ                                 0x4
+
+#define SYS_SW_STATUS_PORT_RX_PAUSED                      BIT(0)
+
+#define SYS_MISC_CFG_PTP_RSRV_CLR                         BIT(1)
+#define SYS_MISC_CFG_PTP_DIS_NEG_RO                       BIT(0)
+
+#define SYS_REW_MAC_HIGH_CFG_RSZ                          0x4
+
+#define SYS_REW_MAC_LOW_CFG_RSZ                           0x4
+
+#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG(x)              (((x) << 6) & GENMASK(21, 6))
+#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG_M               GENMASK(21, 6)
+#define SYS_TIMESTAMP_OFFSET_ETH_TYPE_CFG_X(x)            (((x) & GENMASK(21, 6)) >> 6)
+#define SYS_TIMESTAMP_OFFSET_TIMESTAMP_OFFSET(x)          ((x) & GENMASK(5, 0))
+#define SYS_TIMESTAMP_OFFSET_TIMESTAMP_OFFSET_M           GENMASK(5, 0)
+
+#define SYS_PAUSE_CFG_RSZ                                 0x4
+
+#define SYS_PAUSE_CFG_PAUSE_START(x)                      (((x) << 10) & GENMASK(18, 10))
+#define SYS_PAUSE_CFG_PAUSE_START_M                       GENMASK(18, 10)
+#define SYS_PAUSE_CFG_PAUSE_START_X(x)                    (((x) & GENMASK(18, 10)) >> 10)
+#define SYS_PAUSE_CFG_PAUSE_STOP(x)                       (((x) << 1) & GENMASK(9, 1))
+#define SYS_PAUSE_CFG_PAUSE_STOP_M                        GENMASK(9, 1)
+#define SYS_PAUSE_CFG_PAUSE_STOP_X(x)                     (((x) & GENMASK(9, 1)) >> 1)
+#define SYS_PAUSE_CFG_PAUSE_ENA                           BIT(0)
+
+#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START(x)              (((x) << 9) & GENMASK(17, 9))
+#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START_M               GENMASK(17, 9)
+#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_START_X(x)            (((x) & GENMASK(17, 9)) >> 9)
+#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_STOP(x)               ((x) & GENMASK(8, 0))
+#define SYS_PAUSE_TOT_CFG_PAUSE_TOT_STOP_M                GENMASK(8, 0)
+
+#define SYS_ATOP_RSZ                                      0x4
+
+#define SYS_MAC_FC_CFG_RSZ                                0x4
+
+#define SYS_MAC_FC_CFG_FC_LINK_SPEED(x)                   (((x) << 26) & GENMASK(27, 26))
+#define SYS_MAC_FC_CFG_FC_LINK_SPEED_M                    GENMASK(27, 26)
+#define SYS_MAC_FC_CFG_FC_LINK_SPEED_X(x)                 (((x) & GENMASK(27, 26)) >> 26)
+#define SYS_MAC_FC_CFG_FC_LATENCY_CFG(x)                  (((x) << 20) & GENMASK(25, 20))
+#define SYS_MAC_FC_CFG_FC_LATENCY_CFG_M                   GENMASK(25, 20)
+#define SYS_MAC_FC_CFG_FC_LATENCY_CFG_X(x)                (((x) & GENMASK(25, 20)) >> 20)
+#define SYS_MAC_FC_CFG_ZERO_PAUSE_ENA                     BIT(18)
+#define SYS_MAC_FC_CFG_TX_FC_ENA                          BIT(17)
+#define SYS_MAC_FC_CFG_RX_FC_ENA                          BIT(16)
+#define SYS_MAC_FC_CFG_PAUSE_VAL_CFG(x)                   ((x) & GENMASK(15, 0))
+#define SYS_MAC_FC_CFG_PAUSE_VAL_CFG_M                    GENMASK(15, 0)
+
+#define SYS_MMGT_RELCNT(x)                                (((x) << 16) & GENMASK(31, 16))
+#define SYS_MMGT_RELCNT_M                                 GENMASK(31, 16)
+#define SYS_MMGT_RELCNT_X(x)                              (((x) & GENMASK(31, 16)) >> 16)
+#define SYS_MMGT_FREECNT(x)                               ((x) & GENMASK(15, 0))
+#define SYS_MMGT_FREECNT_M                                GENMASK(15, 0)
+
+#define SYS_MMGT_FAST_FREEVLD(x)                          (((x) << 4) & GENMASK(7, 4))
+#define SYS_MMGT_FAST_FREEVLD_M                           GENMASK(7, 4)
+#define SYS_MMGT_FAST_FREEVLD_X(x)                        (((x) & GENMASK(7, 4)) >> 4)
+#define SYS_MMGT_FAST_RELVLD(x)                           ((x) & GENMASK(3, 0))
+#define SYS_MMGT_FAST_RELVLD_M                            GENMASK(3, 0)
+
+#define SYS_EVENTS_DIF_RSZ                                0x4
+
+#define SYS_EVENTS_DIF_EV_DRX(x)                          (((x) << 6) & GENMASK(8, 6))
+#define SYS_EVENTS_DIF_EV_DRX_M                           GENMASK(8, 6)
+#define SYS_EVENTS_DIF_EV_DRX_X(x)                        (((x) & GENMASK(8, 6)) >> 6)
+#define SYS_EVENTS_DIF_EV_DTX(x)                          ((x) & GENMASK(5, 0))
+#define SYS_EVENTS_DIF_EV_DTX_M                           GENMASK(5, 0)
+
+#define SYS_EVENTS_CORE_EV_FWR                            BIT(2)
+#define SYS_EVENTS_CORE_EV_ANA(x)                         ((x) & GENMASK(1, 0))
+#define SYS_EVENTS_CORE_EV_ANA_M                          GENMASK(1, 0)
+
+#define SYS_CNT_GSZ                                       0x4
+
+#define SYS_PTP_STATUS_PTP_TXSTAMP_OAM                    BIT(29)
+#define SYS_PTP_STATUS_PTP_OVFL                           BIT(28)
+#define SYS_PTP_STATUS_PTP_MESS_VLD                       BIT(27)
+#define SYS_PTP_STATUS_PTP_MESS_ID(x)                     (((x) << 21) & GENMASK(26, 21))
+#define SYS_PTP_STATUS_PTP_MESS_ID_M                      GENMASK(26, 21)
+#define SYS_PTP_STATUS_PTP_MESS_ID_X(x)                   (((x) & GENMASK(26, 21)) >> 21)
+#define SYS_PTP_STATUS_PTP_MESS_TXPORT(x)                 (((x) << 16) & GENMASK(20, 16))
+#define SYS_PTP_STATUS_PTP_MESS_TXPORT_M                  GENMASK(20, 16)
+#define SYS_PTP_STATUS_PTP_MESS_TXPORT_X(x)               (((x) & GENMASK(20, 16)) >> 16)
+#define SYS_PTP_STATUS_PTP_MESS_SEQ_ID(x)                 ((x) & GENMASK(15, 0))
+#define SYS_PTP_STATUS_PTP_MESS_SEQ_ID_M                  GENMASK(15, 0)
+
+#define SYS_PTP_TXSTAMP_PTP_TXSTAMP(x)                    ((x) & GENMASK(29, 0))
+#define SYS_PTP_TXSTAMP_PTP_TXSTAMP_M                     GENMASK(29, 0)
+#define SYS_PTP_TXSTAMP_PTP_TXSTAMP_SEC                   BIT(31)
+
+#define SYS_PTP_NXT_PTP_NXT                               BIT(0)
+
+#define SYS_PTP_CFG_PTP_STAMP_WID(x)                      (((x) << 2) & GENMASK(7, 2))
+#define SYS_PTP_CFG_PTP_STAMP_WID_M                       GENMASK(7, 2)
+#define SYS_PTP_CFG_PTP_STAMP_WID_X(x)                    (((x) & GENMASK(7, 2)) >> 2)
+#define SYS_PTP_CFG_PTP_CF_ROLL_MODE(x)                   ((x) & GENMASK(1, 0))
+#define SYS_PTP_CFG_PTP_CF_ROLL_MODE_M                    GENMASK(1, 0)
+
+#define SYS_RAM_INIT_RAM_INIT                             BIT(1)
+#define SYS_RAM_INIT_RAM_CFG_HOOK                         BIT(0)
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 8dce89aa5f3274e7c26132433840f63d129406bb Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 14 Nov 2019 17:03:29 +0200
Subject: net: dsa: ocelot: add tagger for Ocelot/Felix switches

While it is entirely possible that this tagger format is in fact more
generic than just these 2 switch families, I don't have that knowledge.
The Seville switch in NXP T1040 has a similar frame format, but there
are enough differences (e.g. DEST field starts at bit 57 instead of 56)
that calling this file tag_vitesse.c is a bit of a stretch at the
moment. The frame format has been listed in a comment so that people who
add support for further Vitesse switches can rework this tagger while
keeping compatibility with Felix.

The "ocelot" name was chosen instead of "felix" because even the Ocelot
switch can act as a DSA device when it is used in NPI mode, and the Felix
tagger format is almost identical. Currently it is only used for the
Felix switch embedded in the NXP LS1028A chip.

The ABI for this tagger should be considered "not stable" at the moment.
The DSA tag is always placed before the Ethernet header and therefore,
we are using the long prefix for RX tags to avoid putting the DSA master
port in promiscuous mode. Once there will be an API in DSA for drivers
to request DSA masters to be in promiscuous mode unconditionally, we
will switch to the "no prefix" extraction frame header, which will save
16 padding bytes for each RX frame.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS          |   7 ++
 include/net/dsa.h    |   2 +
 net/dsa/Kconfig      |   7 ++
 net/dsa/Makefile     |   1 +
 net/dsa/tag_ocelot.c | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 246 insertions(+)
 create mode 100644 net/dsa/tag_ocelot.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index d09a3205da37..112befcb712a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17360,6 +17360,13 @@ S:	Maintained
 F:	drivers/input/serio/userio.c
 F:	include/uapi/linux/userio.h
 
+VITESSE FELIX ETHERNET SWITCH DRIVER
+M:	Vladimir Oltean <vladimir.oltean@nxp.com>
+M:	Claudiu Manoil <claudiu.manoil@nxp.com>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	net/dsa/tag_ocelot.c
+
 VIVID VIRTUAL VIDEO DRIVER
 M:	Hans Verkuil <hverkuil@xs4all.nl>
 L:	linux-media@vger.kernel.org
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 9507611a41f0..6767dc3f66c0 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -42,6 +42,7 @@ struct phylink_link_state;
 #define DSA_TAG_PROTO_8021Q_VALUE		12
 #define DSA_TAG_PROTO_SJA1105_VALUE		13
 #define DSA_TAG_PROTO_KSZ8795_VALUE		14
+#define DSA_TAG_PROTO_OCELOT_VALUE		15
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
@@ -59,6 +60,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_8021Q		= DSA_TAG_PROTO_8021Q_VALUE,
 	DSA_TAG_PROTO_SJA1105		= DSA_TAG_PROTO_SJA1105_VALUE,
 	DSA_TAG_PROTO_KSZ8795		= DSA_TAG_PROTO_KSZ8795_VALUE,
+	DSA_TAG_PROTO_OCELOT		= DSA_TAG_PROTO_OCELOT_VALUE,
 };
 
 struct packet_type;
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 136612792c08..1e6c3cac11e6 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -79,6 +79,13 @@ config NET_DSA_TAG_KSZ
 	  Say Y if you want to enable support for tagging frames for the
 	  Microchip 8795/9477/9893 families of switches.
 
+config NET_DSA_TAG_OCELOT
+	tristate "Tag driver for Ocelot family of switches"
+	select PACKING
+	help
+	  Say Y or M if you want to enable support for tagging frames for the
+	  Ocelot switches (VSC7511, VSC7512, VSC7513, VSC7514, VSC9959).
+
 config NET_DSA_TAG_QCA
 	tristate "Tag driver for Qualcomm Atheros QCA8K switches"
 	help
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 2c6d286f0511..9a482c38bdb1 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o
 obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
 obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
 obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
+obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o
 obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
 obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o
 obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
new file mode 100644
index 000000000000..078d4790669d
--- /dev/null
+++ b/net/dsa/tag_ocelot.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2019 NXP Semiconductors
+ */
+#include <soc/mscc/ocelot.h>
+#include <linux/packing.h>
+#include "dsa_priv.h"
+
+/* The CPU injection header and the CPU extraction header can have 3 types of
+ * prefixes: long, short and no prefix. The format of the header itself is the
+ * same in all 3 cases.
+ *
+ * Extraction with long prefix:
+ *
+ * +-------------------+-------------------+------+------+------------+-------+
+ * | ff:ff:ff:ff:ff:ff | ff:ff:ff:ff:ff:ff | 8880 | 000a | extraction | frame |
+ * |                   |                   |      |      |   header   |       |
+ * +-------------------+-------------------+------+------+------------+-------+
+ *        48 bits             48 bits      16 bits 16 bits  128 bits
+ *
+ * Extraction with short prefix:
+ *
+ *                                         +------+------+------------+-------+
+ *                                         | 8880 | 000a | extraction | frame |
+ *                                         |      |      |   header   |       |
+ *                                         +------+------+------------+-------+
+ *                                         16 bits 16 bits  128 bits
+ *
+ * Extraction with no prefix:
+ *
+ *                                                       +------------+-------+
+ *                                                       | extraction | frame |
+ *                                                       |   header   |       |
+ *                                                       +------------+-------+
+ *                                                          128 bits
+ *
+ *
+ * Injection with long prefix:
+ *
+ * +-------------------+-------------------+------+------+------------+-------+
+ * |      any dmac     |      any smac     | 8880 | 000a | injection  | frame |
+ * |                   |                   |      |      |   header   |       |
+ * +-------------------+-------------------+------+------+------------+-------+
+ *        48 bits             48 bits      16 bits 16 bits  128 bits
+ *
+ * Injection with short prefix:
+ *
+ *                                         +------+------+------------+-------+
+ *                                         | 8880 | 000a | injection  | frame |
+ *                                         |      |      |   header   |       |
+ *                                         +------+------+------------+-------+
+ *                                         16 bits 16 bits  128 bits
+ *
+ * Injection with no prefix:
+ *
+ *                                                       +------------+-------+
+ *                                                       | injection  | frame |
+ *                                                       |   header   |       |
+ *                                                       +------------+-------+
+ *                                                          128 bits
+ *
+ * The injection header looks like this (network byte order, bit 127
+ * is part of lowest address byte in memory, bit 0 is part of highest
+ * address byte):
+ *
+ *         +------+------+------+------+------+------+------+------+
+ * 127:120 |BYPASS| MASQ |          MASQ_PORT        |REW_OP|REW_OP|
+ *         +------+------+------+------+------+------+------+------+
+ * 119:112 |                         REW_OP                        |
+ *         +------+------+------+------+------+------+------+------+
+ * 111:104 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ * 103: 96 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ *  95: 88 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ *  87: 80 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ *  79: 72 |                          RSV                          |
+ *         +------+------+------+------+------+------+------+------+
+ *  71: 64 |            RSV            |           DEST            |
+ *         +------+------+------+------+------+------+------+------+
+ *  63: 56 |                         DEST                          |
+ *         +------+------+------+------+------+------+------+------+
+ *  55: 48 |                          RSV                          |
+ *         +------+------+------+------+------+------+------+------+
+ *  47: 40 |  RSV |         SRC_PORT          |     RSV     |TFRM_TIMER|
+ *         +------+------+------+------+------+------+------+------+
+ *  39: 32 |     TFRM_TIMER     |               RSV                |
+ *         +------+------+------+------+------+------+------+------+
+ *  31: 24 |  RSV |  DP  |   POP_CNT   |           CPUQ            |
+ *         +------+------+------+------+------+------+------+------+
+ *  23: 16 |           CPUQ            |      QOS_CLASS     |TAG_TYPE|
+ *         +------+------+------+------+------+------+------+------+
+ *  15:  8 |         PCP        |  DEI |            VID            |
+ *         +------+------+------+------+------+------+------+------+
+ *   7:  0 |                          VID                          |
+ *         +------+------+------+------+------+------+------+------+
+ *
+ * And the extraction header looks like this:
+ *
+ *         +------+------+------+------+------+------+------+------+
+ * 127:120 |  RSV |                  REW_OP                        |
+ *         +------+------+------+------+------+------+------+------+
+ * 119:112 |       REW_OP       |              REW_VAL             |
+ *         +------+------+------+------+------+------+------+------+
+ * 111:104 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ * 103: 96 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ *  95: 88 |                         REW_VAL                       |
+ *         +------+------+------+------+------+------+------+------+
+ *  87: 80 |       REW_VAL      |               LLEN               |
+ *         +------+------+------+------+------+------+------+------+
+ *  79: 72 | LLEN |                      WLEN                      |
+ *         +------+------+------+------+------+------+------+------+
+ *  71: 64 | WLEN |                      RSV                       |
+ *         +------+------+------+------+------+------+------+------+
+ *  63: 56 |                          RSV                          |
+ *         +------+------+------+------+------+------+------+------+
+ *  55: 48 |                          RSV                          |
+ *         +------+------+------+------+------+------+------+------+
+ *  47: 40 | RSV  |          SRC_PORT         |       ACL_ID       |
+ *         +------+------+------+------+------+------+------+------+
+ *  39: 32 |       ACL_ID       |  RSV |         SFLOW_ID          |
+ *         +------+------+------+------+------+------+------+------+
+ *  31: 24 |ACL_HIT| DP  |  LRN_FLAGS  |           CPUQ            |
+ *         +------+------+------+------+------+------+------+------+
+ *  23: 16 |           CPUQ            |      QOS_CLASS     |TAG_TYPE|
+ *         +------+------+------+------+------+------+------+------+
+ *  15:  8 |         PCP        |  DEI |            VID            |
+ *         +------+------+------+------+------+------+------+------+
+ *   7:  0 |                          VID                          |
+ *         +------+------+------+------+------+------+------+------+
+ */
+
+static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
+				   struct net_device *netdev)
+{
+	struct dsa_port *dp = dsa_slave_to_port(netdev);
+	u64 bypass, dest, src, qos_class;
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+	u8 *injection;
+
+	if (unlikely(skb_cow_head(skb, OCELOT_TAG_LEN) < 0)) {
+		netdev_err(netdev, "Cannot make room for tag.\n");
+		return NULL;
+	}
+
+	injection = skb_push(skb, OCELOT_TAG_LEN);
+
+	memset(injection, 0, OCELOT_TAG_LEN);
+
+	src = dsa_upstream_port(ds, port);
+	dest = BIT(port);
+	bypass = true;
+	qos_class = skb->priority;
+
+	packing(injection, &bypass,   127, 127, OCELOT_TAG_LEN, PACK, 0);
+	packing(injection, &dest,      68,  56, OCELOT_TAG_LEN, PACK, 0);
+	packing(injection, &src,       46,  43, OCELOT_TAG_LEN, PACK, 0);
+	packing(injection, &qos_class, 19,  17, OCELOT_TAG_LEN, PACK, 0);
+
+	return skb;
+}
+
+static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
+				  struct net_device *netdev,
+				  struct packet_type *pt)
+{
+	u64 src_port, qos_class;
+	u8 *start = skb->data;
+	u8 *extraction;
+
+	/* Revert skb->data by the amount consumed by the DSA master,
+	 * so it points to the beginning of the frame.
+	 */
+	skb_push(skb, ETH_HLEN);
+	/* We don't care about the long prefix, it is just for easy entrance
+	 * into the DSA master's RX filter. Discard it now by moving it into
+	 * the headroom.
+	 */
+	skb_pull(skb, OCELOT_LONG_PREFIX_LEN);
+	/* And skb->data now points to the extraction frame header.
+	 * Keep a pointer to it.
+	 */
+	extraction = skb->data;
+	/* Now the EFH is part of the headroom as well */
+	skb_pull(skb, OCELOT_TAG_LEN);
+	/* Reset the pointer to the real MAC header */
+	skb_reset_mac_header(skb);
+	skb_reset_mac_len(skb);
+	/* And move skb->data to the correct location again */
+	skb_pull(skb, ETH_HLEN);
+
+	/* Remove from inet csum the extraction header */
+	skb_postpull_rcsum(skb, start, OCELOT_LONG_PREFIX_LEN + OCELOT_TAG_LEN);
+
+	packing(extraction, &src_port,  46, 43, OCELOT_TAG_LEN, UNPACK, 0);
+	packing(extraction, &qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0);
+
+	skb->dev = dsa_master_find_slave(netdev, 0, src_port);
+	if (!skb->dev)
+		/* The switch will reflect back some frames sent through
+		 * sockets opened on the bare DSA master. These will come back
+		 * with src_port equal to the index of the CPU port, for which
+		 * there is no slave registered. So don't print any error
+		 * message here (ignore and drop those frames).
+		 */
+		return NULL;
+
+	skb->offload_fwd_mark = 1;
+	skb->priority = qos_class;
+
+	return skb;
+}
+
+static struct dsa_device_ops ocelot_netdev_ops = {
+	.name			= "ocelot",
+	.proto			= DSA_TAG_PROTO_OCELOT,
+	.xmit			= ocelot_xmit,
+	.rcv			= ocelot_rcv,
+	.overhead		= OCELOT_TAG_LEN + OCELOT_LONG_PREFIX_LEN,
+};
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT);
+
+module_dsa_tag_driver(ocelot_netdev_ops);
-- 
cgit v1.2.3-59-g8ed1b


From b7b3fc8dd95bc02bd30680da258e09dda55270db Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Fri, 15 Nov 2019 13:37:22 +0100
Subject: bpf: Support doubleword alignment in bpf_jit_binary_alloc

Currently passing alignment greater than 4 to bpf_jit_binary_alloc does
not work: in such cases it silently aligns only to 4 bytes.

On s390, in order to load a constant from memory in a large (>512k) BPF
program, one must use lgrl instruction, whose memory operand must be
aligned on an 8-byte boundary.

This patch makes it possible to request 8-byte alignment from
bpf_jit_binary_alloc, and also makes it issue a warning when an
unsupported alignment is requested.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20191115123722.58462-1-iii@linux.ibm.com
---
 include/linux/filter.h | 6 ++++--
 kernel/bpf/core.c      | 4 ++++
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7a6f8f6f1da4..ad80e9c6111c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -515,10 +515,12 @@ struct sock_fprog_kern {
 	struct sock_filter	*filter;
 };
 
+/* Some arches need doubleword alignment for their instructions and/or data */
+#define BPF_IMAGE_ALIGNMENT 8
+
 struct bpf_binary_header {
 	u32 pages;
-	/* Some arches need word alignment for their instructions */
-	u8 image[] __aligned(4);
+	u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
 };
 
 struct bpf_prog {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c1fde0303280..99693f3c4e99 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -31,6 +31,7 @@
 #include <linux/rcupdate.h>
 #include <linux/perf_event.h>
 #include <linux/extable.h>
+#include <linux/log2.h>
 #include <asm/unaligned.h>
 
 /* Registers */
@@ -815,6 +816,9 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	struct bpf_binary_header *hdr;
 	u32 size, hole, start, pages;
 
+	WARN_ON_ONCE(!is_power_of_2(alignment) ||
+		     alignment > BPF_IMAGE_ALIGNMENT);
+
 	/* Most of BPF filters are really small, but if some of them
 	 * fill a page, allow at least 128 extra bytes to insert a
 	 * random section of illegal instructions.
-- 
cgit v1.2.3-59-g8ed1b


From 5964b2000f283ff5df366f718e0f083ebbaae977 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 14 Nov 2019 10:57:03 -0800
Subject: bpf: Add bpf_arch_text_poke() helper

Add bpf_arch_text_poke() helper that is used by BPF trampoline logic to patch
nops/calls in kernel text into calls into BPF trampoline and to patch
calls/nops inside BPF programs too.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20191114185720.1641606-4-ast@kernel.org
---
 arch/x86/net/bpf_jit_comp.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/bpf.h         |  8 +++++++
 kernel/bpf/core.c           |  6 ++++++
 3 files changed, 65 insertions(+)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index fb99d976ad6e..254b2889e881 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -9,9 +9,11 @@
 #include <linux/filter.h>
 #include <linux/if_vlan.h>
 #include <linux/bpf.h>
+#include <linux/memory.h>
 #include <asm/extable.h>
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
+#include <asm/text-patching.h>
 
 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
@@ -486,6 +488,55 @@ static int emit_call(u8 **pprog, void *func, void *ip)
 	return 0;
 }
 
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+		       void *old_addr, void *new_addr)
+{
+	u8 old_insn[X86_CALL_SIZE] = {};
+	u8 new_insn[X86_CALL_SIZE] = {};
+	u8 *prog;
+	int ret;
+
+	if (!is_kernel_text((long)ip))
+		/* BPF trampoline in modules is not supported */
+		return -EINVAL;
+
+	if (old_addr) {
+		prog = old_insn;
+		ret = emit_call(&prog, old_addr, (void *)ip);
+		if (ret)
+			return ret;
+	}
+	if (new_addr) {
+		prog = new_insn;
+		ret = emit_call(&prog, new_addr, (void *)ip);
+		if (ret)
+			return ret;
+	}
+	ret = -EBUSY;
+	mutex_lock(&text_mutex);
+	switch (t) {
+	case BPF_MOD_NOP_TO_CALL:
+		if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
+			goto out;
+		text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
+		break;
+	case BPF_MOD_CALL_TO_CALL:
+		if (memcmp(ip, old_insn, X86_CALL_SIZE))
+			goto out;
+		text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
+		break;
+	case BPF_MOD_CALL_TO_NOP:
+		if (memcmp(ip, old_insn, X86_CALL_SIZE))
+			goto out;
+		text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE, NULL);
+		break;
+	}
+	ret = 0;
+out:
+	mutex_unlock(&text_mutex);
+	return ret;
+}
+
 static bool ex_handler_bpf(const struct exception_table_entry *x,
 			   struct pt_regs *regs, int trapnr,
 			   unsigned long error_code, unsigned long fault_addr)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7c7f518811a6..8b90db25348a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1157,4 +1157,12 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
 }
 #endif /* CONFIG_INET */
 
+enum bpf_text_poke_type {
+	BPF_MOD_NOP_TO_CALL,
+	BPF_MOD_CALL_TO_CALL,
+	BPF_MOD_CALL_TO_NOP,
+};
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+		       void *addr1, void *addr2);
+
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 99693f3c4e99..434a0d920153 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2144,6 +2144,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 	return -EFAULT;
 }
 
+int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+			      void *addr1, void *addr2)
+{
+	return -ENOTSUPP;
+}
+
 DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 EXPORT_SYMBOL(bpf_stats_enabled_key);
 
-- 
cgit v1.2.3-59-g8ed1b


From fec56f5890d93fc2ed74166c397dc186b1c25951 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 14 Nov 2019 10:57:04 -0800
Subject: bpf: Introduce BPF trampoline

Introduce BPF trampoline concept to allow kernel code to call into BPF programs
with practically zero overhead.  The trampoline generation logic is
architecture dependent.  It's converting native calling convention into BPF
calling convention.  BPF ISA is 64-bit (even on 32-bit architectures). The
registers R1 to R5 are used to pass arguments into BPF functions. The main BPF
program accepts only single argument "ctx" in R1. Whereas CPU native calling
convention is different. x86-64 is passing first 6 arguments in registers
and the rest on the stack. x86-32 is passing first 3 arguments in registers.
sparc64 is passing first 6 in registers. And so on.

The trampolines between BPF and kernel already exist.  BPF_CALL_x macros in
include/linux/filter.h statically compile trampolines from BPF into kernel
helpers. They convert up to five u64 arguments into kernel C pointers and
integers. On 64-bit architectures this BPF_to_kernel trampolines are nops. On
32-bit architecture they're meaningful.

The opposite job kernel_to_BPF trampolines is done by CAST_TO_U64 macros and
__bpf_trace_##call() shim functions in include/trace/bpf_probe.h. They convert
kernel function arguments into array of u64s that BPF program consumes via
R1=ctx pointer.

This patch set is doing the same job as __bpf_trace_##call() static
trampolines, but dynamically for any kernel function. There are ~22k global
kernel functions that are attachable via nop at function entry. The function
arguments and types are described in BTF.  The job of btf_distill_func_proto()
function is to extract useful information from BTF into "function model" that
architecture dependent trampoline generators will use to generate assembly code
to cast kernel function arguments into array of u64s.  For example the kernel
function eth_type_trans has two pointers. They will be casted to u64 and stored
into stack of generated trampoline. The pointer to that stack space will be
passed into BPF program in R1. On x86-64 such generated trampoline will consume
16 bytes of stack and two stores of %rdi and %rsi into stack. The verifier will
make sure that only two u64 are accessed read-only by BPF program. The verifier
will also recognize the precise type of the pointers being accessed and will
not allow typecasting of the pointer to a different type within BPF program.

The tracing use case in the datacenter demonstrated that certain key kernel
functions have (like tcp_retransmit_skb) have 2 or more kprobes that are always
active.  Other functions have both kprobe and kretprobe.  So it is essential to
keep both kernel code and BPF programs executing at maximum speed. Hence
generated BPF trampoline is re-generated every time new program is attached or
detached to maintain maximum performance.

To avoid the high cost of retpoline the attached BPF programs are called
directly. __bpf_prog_enter/exit() are used to support per-program execution
stats.  In the future this logic will be optimized further by adding support
for bpf_stats_enabled_key inside generated assembly code. Introduction of
preemptible and sleepable BPF programs will completely remove the need to call
to __bpf_prog_enter/exit().

Detach of a BPF program from the trampoline should not fail. To avoid memory
allocation in detach path the half of the page is used as a reserve and flipped
after each attach/detach. 2k bytes is enough to call 40+ BPF programs directly
which is enough for BPF tracing use cases. This limit can be increased in the
future.

BPF_TRACE_FENTRY programs have access to raw kernel function arguments while
BPF_TRACE_FEXIT programs have access to kernel return value as well. Often
kprobe BPF program remembers function arguments in a map while kretprobe
fetches arguments from a map and analyzes them together with return value.
BPF_TRACE_FEXIT accelerates this typical use case.

Recursion prevention for kprobe BPF programs is done via per-cpu
bpf_prog_active counter. In practice that turned out to be a mistake. It
caused programs to randomly skip execution. The tracing tools missed results
they were looking for. Hence BPF trampoline doesn't provide builtin recursion
prevention. It's a job of BPF program itself and will be addressed in the
follow up patches.

BPF trampoline is intended to be used beyond tracing and fentry/fexit use cases
in the future. For example to remove retpoline cost from XDP programs.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20191114185720.1641606-5-ast@kernel.org
---
 arch/x86/net/bpf_jit_comp.c | 211 +++++++++++++++++++++++++++++++++++-
 include/linux/bpf.h         | 105 ++++++++++++++++++
 include/uapi/linux/bpf.h    |   2 +
 kernel/bpf/Makefile         |   1 +
 kernel/bpf/btf.c            |  77 +++++++++++++-
 kernel/bpf/core.c           |   1 +
 kernel/bpf/syscall.c        |  53 +++++++++-
 kernel/bpf/trampoline.c     | 253 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c       |  42 ++++++++
 9 files changed, 735 insertions(+), 10 deletions(-)
 create mode 100644 kernel/bpf/trampoline.c

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 254b2889e881..be2b43a894f6 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -98,6 +98,7 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 
 /* Pick a register outside of BPF range for JIT internal work */
 #define AUX_REG (MAX_BPF_JIT_REG + 1)
+#define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
 
 /*
  * The following table maps BPF registers to x86-64 registers.
@@ -106,8 +107,8 @@ static int bpf_size_to_x86_bytes(int bpf_size)
  * register in load/store instructions, it always needs an
  * extra byte of encoding and is callee saved.
  *
- * Also x86-64 register R9 is unused. x86-64 register R10 is
- * used for blinding (if enabled).
+ * x86-64 register R9 is not used by BPF programs, but can be used by BPF
+ * trampoline. x86-64 register R10 is used for blinding (if enabled).
  */
 static const int reg2hex[] = {
 	[BPF_REG_0] = 0,  /* RAX */
@@ -123,6 +124,7 @@ static const int reg2hex[] = {
 	[BPF_REG_FP] = 5, /* RBP readonly */
 	[BPF_REG_AX] = 2, /* R10 temp register */
 	[AUX_REG] = 3,    /* R11 temp register */
+	[X86_REG_R9] = 1, /* R9 register, 6th function argument */
 };
 
 static const int reg2pt_regs[] = {
@@ -150,6 +152,7 @@ static bool is_ereg(u32 reg)
 			     BIT(BPF_REG_7) |
 			     BIT(BPF_REG_8) |
 			     BIT(BPF_REG_9) |
+			     BIT(X86_REG_R9) |
 			     BIT(BPF_REG_AX));
 }
 
@@ -1233,6 +1236,210 @@ emit_jmp:
 	return proglen;
 }
 
+static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args,
+		      int stack_size)
+{
+	int i;
+	/* Store function arguments to stack.
+	 * For a function that accepts two pointers the sequence will be:
+	 * mov QWORD PTR [rbp-0x10],rdi
+	 * mov QWORD PTR [rbp-0x8],rsi
+	 */
+	for (i = 0; i < min(nr_args, 6); i++)
+		emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]),
+			 BPF_REG_FP,
+			 i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
+			 -(stack_size - i * 8));
+}
+
+static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args,
+			 int stack_size)
+{
+	int i;
+
+	/* Restore function arguments from stack.
+	 * For a function that accepts two pointers the sequence will be:
+	 * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
+	 * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
+	 */
+	for (i = 0; i < min(nr_args, 6); i++)
+		emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]),
+			 i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
+			 BPF_REG_FP,
+			 -(stack_size - i * 8));
+}
+
+static int invoke_bpf(struct btf_func_model *m, u8 **pprog,
+		      struct bpf_prog **progs, int prog_cnt, int stack_size)
+{
+	u8 *prog = *pprog;
+	int cnt = 0, i;
+
+	for (i = 0; i < prog_cnt; i++) {
+		if (emit_call(&prog, __bpf_prog_enter, prog))
+			return -EINVAL;
+		/* remember prog start time returned by __bpf_prog_enter */
+		emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+
+		/* arg1: lea rdi, [rbp - stack_size] */
+		EMIT4(0x48, 0x8D, 0x7D, -stack_size);
+		/* arg2: progs[i]->insnsi for interpreter */
+		if (!progs[i]->jited)
+			emit_mov_imm64(&prog, BPF_REG_2,
+				       (long) progs[i]->insnsi >> 32,
+				       (u32) (long) progs[i]->insnsi);
+		/* call JITed bpf program or interpreter */
+		if (emit_call(&prog, progs[i]->bpf_func, prog))
+			return -EINVAL;
+
+		/* arg1: mov rdi, progs[i] */
+		emit_mov_imm64(&prog, BPF_REG_1, (long) progs[i] >> 32,
+			       (u32) (long) progs[i]);
+		/* arg2: mov rsi, rbx <- start time in nsec */
+		emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
+		if (emit_call(&prog, __bpf_prog_exit, prog))
+			return -EINVAL;
+	}
+	*pprog = prog;
+	return 0;
+}
+
+/* Example:
+ * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
+ * its 'struct btf_func_model' will be nr_args=2
+ * The assembly code when eth_type_trans is executing after trampoline:
+ *
+ * push rbp
+ * mov rbp, rsp
+ * sub rsp, 16                     // space for skb and dev
+ * push rbx                        // temp regs to pass start time
+ * mov qword ptr [rbp - 16], rdi   // save skb pointer to stack
+ * mov qword ptr [rbp - 8], rsi    // save dev pointer to stack
+ * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
+ * mov rbx, rax                    // remember start time in bpf stats are enabled
+ * lea rdi, [rbp - 16]             // R1==ctx of bpf prog
+ * call addr_of_jited_FENTRY_prog
+ * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
+ * mov rsi, rbx                    // prog start time
+ * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
+ * mov rdi, qword ptr [rbp - 16]   // restore skb pointer from stack
+ * mov rsi, qword ptr [rbp - 8]    // restore dev pointer from stack
+ * pop rbx
+ * leave
+ * ret
+ *
+ * eth_type_trans has 5 byte nop at the beginning. These 5 bytes will be
+ * replaced with 'call generated_bpf_trampoline'. When it returns
+ * eth_type_trans will continue executing with original skb and dev pointers.
+ *
+ * The assembly code when eth_type_trans is called from trampoline:
+ *
+ * push rbp
+ * mov rbp, rsp
+ * sub rsp, 24                     // space for skb, dev, return value
+ * push rbx                        // temp regs to pass start time
+ * mov qword ptr [rbp - 24], rdi   // save skb pointer to stack
+ * mov qword ptr [rbp - 16], rsi   // save dev pointer to stack
+ * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
+ * mov rbx, rax                    // remember start time if bpf stats are enabled
+ * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
+ * call addr_of_jited_FENTRY_prog  // bpf prog can access skb and dev
+ * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
+ * mov rsi, rbx                    // prog start time
+ * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
+ * mov rdi, qword ptr [rbp - 24]   // restore skb pointer from stack
+ * mov rsi, qword ptr [rbp - 16]   // restore dev pointer from stack
+ * call eth_type_trans+5           // execute body of eth_type_trans
+ * mov qword ptr [rbp - 8], rax    // save return value
+ * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
+ * mov rbx, rax                    // remember start time in bpf stats are enabled
+ * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
+ * call addr_of_jited_FEXIT_prog   // bpf prog can access skb, dev, return value
+ * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
+ * mov rsi, rbx                    // prog start time
+ * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
+ * mov rax, qword ptr [rbp - 8]    // restore eth_type_trans's return value
+ * pop rbx
+ * leave
+ * add rsp, 8                      // skip eth_type_trans's frame
+ * ret                             // return to its caller
+ */
+int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
+				struct bpf_prog **fentry_progs, int fentry_cnt,
+				struct bpf_prog **fexit_progs, int fexit_cnt,
+				void *orig_call)
+{
+	int cnt = 0, nr_args = m->nr_args;
+	int stack_size = nr_args * 8;
+	u8 *prog;
+
+	/* x86-64 supports up to 6 arguments. 7+ can be added in the future */
+	if (nr_args > 6)
+		return -ENOTSUPP;
+
+	if ((flags & BPF_TRAMP_F_RESTORE_REGS) &&
+	    (flags & BPF_TRAMP_F_SKIP_FRAME))
+		return -EINVAL;
+
+	if (flags & BPF_TRAMP_F_CALL_ORIG)
+		stack_size += 8; /* room for return value of orig_call */
+
+	if (flags & BPF_TRAMP_F_SKIP_FRAME)
+		/* skip patched call instruction and point orig_call to actual
+		 * body of the kernel function.
+		 */
+		orig_call += X86_CALL_SIZE;
+
+	prog = image;
+
+	EMIT1(0x55);		 /* push rbp */
+	EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+	EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
+	EMIT1(0x53);		 /* push rbx */
+
+	save_regs(m, &prog, nr_args, stack_size);
+
+	if (fentry_cnt)
+		if (invoke_bpf(m, &prog, fentry_progs, fentry_cnt, stack_size))
+			return -EINVAL;
+
+	if (flags & BPF_TRAMP_F_CALL_ORIG) {
+		if (fentry_cnt)
+			restore_regs(m, &prog, nr_args, stack_size);
+
+		/* call original function */
+		if (emit_call(&prog, orig_call, prog))
+			return -EINVAL;
+		/* remember return value in a stack for bpf prog to access */
+		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
+	}
+
+	if (fexit_cnt)
+		if (invoke_bpf(m, &prog, fexit_progs, fexit_cnt, stack_size))
+			return -EINVAL;
+
+	if (flags & BPF_TRAMP_F_RESTORE_REGS)
+		restore_regs(m, &prog, nr_args, stack_size);
+
+	if (flags & BPF_TRAMP_F_CALL_ORIG)
+		/* restore original return value back into RAX */
+		emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
+
+	EMIT1(0x5B); /* pop rbx */
+	EMIT1(0xC9); /* leave */
+	if (flags & BPF_TRAMP_F_SKIP_FRAME)
+		/* skip our return address and return to parent */
+		EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
+	EMIT1(0xC3); /* ret */
+	/* One half of the page has active running trampoline.
+	 * Another half is an area for next trampoline.
+	 * Make sure the trampoline generation logic doesn't overflow.
+	 */
+	if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY))
+		return -EFAULT;
+	return 0;
+}
+
 struct x64_jit_data {
 	struct bpf_binary_header *header;
 	int *addrs;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8b90db25348a..0d4c5c224d79 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -14,6 +14,8 @@
 #include <linux/numa.h>
 #include <linux/wait.h>
 #include <linux/u64_stats_sync.h>
+#include <linux/refcount.h>
+#include <linux/mutex.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -384,6 +386,100 @@ struct bpf_prog_stats {
 	struct u64_stats_sync syncp;
 } __aligned(2 * sizeof(u64));
 
+struct btf_func_model {
+	u8 ret_size;
+	u8 nr_args;
+	u8 arg_size[MAX_BPF_FUNC_ARGS];
+};
+
+/* Restore arguments before returning from trampoline to let original function
+ * continue executing. This flag is used for fentry progs when there are no
+ * fexit progs.
+ */
+#define BPF_TRAMP_F_RESTORE_REGS	BIT(0)
+/* Call original function after fentry progs, but before fexit progs.
+ * Makes sense for fentry/fexit, normal calls and indirect calls.
+ */
+#define BPF_TRAMP_F_CALL_ORIG		BIT(1)
+/* Skip current frame and return to parent.  Makes sense for fentry/fexit
+ * programs only. Should not be used with normal calls and indirect calls.
+ */
+#define BPF_TRAMP_F_SKIP_FRAME		BIT(2)
+
+/* Different use cases for BPF trampoline:
+ * 1. replace nop at the function entry (kprobe equivalent)
+ *    flags = BPF_TRAMP_F_RESTORE_REGS
+ *    fentry = a set of programs to run before returning from trampoline
+ *
+ * 2. replace nop at the function entry (kprobe + kretprobe equivalent)
+ *    flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME
+ *    orig_call = fentry_ip + MCOUNT_INSN_SIZE
+ *    fentry = a set of program to run before calling original function
+ *    fexit = a set of program to run after original function
+ *
+ * 3. replace direct call instruction anywhere in the function body
+ *    or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid)
+ *    With flags = 0
+ *      fentry = a set of programs to run before returning from trampoline
+ *    With flags = BPF_TRAMP_F_CALL_ORIG
+ *      orig_call = original callback addr or direct function addr
+ *      fentry = a set of program to run before calling original function
+ *      fexit = a set of program to run after original function
+ */
+int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
+				struct bpf_prog **fentry_progs, int fentry_cnt,
+				struct bpf_prog **fexit_progs, int fexit_cnt,
+				void *orig_call);
+/* these two functions are called from generated trampoline */
+u64 notrace __bpf_prog_enter(void);
+void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
+
+enum bpf_tramp_prog_type {
+	BPF_TRAMP_FENTRY,
+	BPF_TRAMP_FEXIT,
+	BPF_TRAMP_MAX
+};
+
+struct bpf_trampoline {
+	/* hlist for trampoline_table */
+	struct hlist_node hlist;
+	/* serializes access to fields of this trampoline */
+	struct mutex mutex;
+	refcount_t refcnt;
+	u64 key;
+	struct {
+		struct btf_func_model model;
+		void *addr;
+	} func;
+	/* list of BPF programs using this trampoline */
+	struct hlist_head progs_hlist[BPF_TRAMP_MAX];
+	/* Number of attached programs. A counter per kind. */
+	int progs_cnt[BPF_TRAMP_MAX];
+	/* Executable image of trampoline */
+	void *image;
+	u64 selector;
+};
+#ifdef CONFIG_BPF_JIT
+struct bpf_trampoline *bpf_trampoline_lookup(u64 key);
+int bpf_trampoline_link_prog(struct bpf_prog *prog);
+int bpf_trampoline_unlink_prog(struct bpf_prog *prog);
+void bpf_trampoline_put(struct bpf_trampoline *tr);
+#else
+static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
+{
+	return NULL;
+}
+static inline int bpf_trampoline_link_prog(struct bpf_prog *prog)
+{
+	return -ENOTSUPP;
+}
+static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
+{
+	return -ENOTSUPP;
+}
+static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
+#endif
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -398,6 +494,9 @@ struct bpf_prog_aux {
 	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
 	bool offload_requested;
 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
+	enum bpf_tramp_prog_type trampoline_prog_type;
+	struct bpf_trampoline *trampoline;
+	struct hlist_node tramp_hlist;
 	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
 	const struct btf_type *attach_func_proto;
 	/* function name for valid attach_btf_id */
@@ -784,6 +883,12 @@ int btf_struct_access(struct bpf_verifier_log *log,
 		      u32 *next_btf_id);
 u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int);
 
+int btf_distill_func_proto(struct bpf_verifier_log *log,
+			   struct btf *btf,
+			   const struct btf_type *func_proto,
+			   const char *func_name,
+			   struct btf_func_model *m);
+
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index df6809a76404..69c200e6e696 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -201,6 +201,8 @@ enum bpf_attach_type {
 	BPF_CGROUP_GETSOCKOPT,
 	BPF_CGROUP_SETSOCKOPT,
 	BPF_TRACE_RAW_TP,
+	BPF_TRACE_FENTRY,
+	BPF_TRACE_FEXIT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e1d9adb212f9..3f671bf617e8 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+obj-$(CONFIG_BPF_JIT) += trampoline.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4639c4ba9a9b..9e1164e5b429 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3517,13 +3517,18 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		args++;
 		nr_args--;
 	}
-	if (arg >= nr_args) {
+
+	if (prog->expected_attach_type == BPF_TRACE_FEXIT &&
+	    arg == nr_args) {
+		/* function return type */
+		t = btf_type_by_id(btf_vmlinux, t->type);
+	} else if (arg >= nr_args) {
 		bpf_log(log, "func '%s' doesn't have %d-th argument\n",
-			tname, arg);
+			tname, arg + 1);
 		return false;
+	} else {
+		t = btf_type_by_id(btf_vmlinux, args[arg].type);
 	}
-
-	t = btf_type_by_id(btf_vmlinux, args[arg].type);
 	/* skip modifiers */
 	while (btf_type_is_modifier(t))
 		t = btf_type_by_id(btf_vmlinux, t->type);
@@ -3784,6 +3789,70 @@ u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg)
 	return btf_id;
 }
 
+static int __get_type_size(struct btf *btf, u32 btf_id,
+			   const struct btf_type **bad_type)
+{
+	const struct btf_type *t;
+
+	if (!btf_id)
+		/* void */
+		return 0;
+	t = btf_type_by_id(btf, btf_id);
+	while (t && btf_type_is_modifier(t))
+		t = btf_type_by_id(btf, t->type);
+	if (!t)
+		return -EINVAL;
+	if (btf_type_is_ptr(t))
+		/* kernel size of pointer. Not BPF's size of pointer*/
+		return sizeof(void *);
+	if (btf_type_is_int(t) || btf_type_is_enum(t))
+		return t->size;
+	*bad_type = t;
+	return -EINVAL;
+}
+
+int btf_distill_func_proto(struct bpf_verifier_log *log,
+			   struct btf *btf,
+			   const struct btf_type *func,
+			   const char *tname,
+			   struct btf_func_model *m)
+{
+	const struct btf_param *args;
+	const struct btf_type *t;
+	u32 i, nargs;
+	int ret;
+
+	args = (const struct btf_param *)(func + 1);
+	nargs = btf_type_vlen(func);
+	if (nargs >= MAX_BPF_FUNC_ARGS) {
+		bpf_log(log,
+			"The function %s has %d arguments. Too many.\n",
+			tname, nargs);
+		return -EINVAL;
+	}
+	ret = __get_type_size(btf, func->type, &t);
+	if (ret < 0) {
+		bpf_log(log,
+			"The function %s return type %s is unsupported.\n",
+			tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
+		return -EINVAL;
+	}
+	m->ret_size = ret;
+
+	for (i = 0; i < nargs; i++) {
+		ret = __get_type_size(btf, args[i].type, &t);
+		if (ret < 0) {
+			bpf_log(log,
+				"The function %s arg%d type %s is unsupported.\n",
+				tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+			return -EINVAL;
+		}
+		m->arg_size[i] = ret;
+	}
+	m->nr_args = nargs;
+	return 0;
+}
+
 void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m)
 {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 434a0d920153..da5a8b8e278f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2015,6 +2015,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	if (aux->prog->has_callchain_buf)
 		put_callchain_buffers();
 #endif
+	bpf_trampoline_put(aux->trampoline);
 	for (i = 0; i < aux->func_cnt; i++)
 		bpf_jit_free(aux->func[i]);
 	if (aux->func_cnt) {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6d9ce95e5a8d..e2e37bea86bc 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1799,6 +1799,49 @@ static int bpf_obj_get(const union bpf_attr *attr)
 				attr->file_flags);
 }
 
+static int bpf_tracing_prog_release(struct inode *inode, struct file *filp)
+{
+	struct bpf_prog *prog = filp->private_data;
+
+	WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
+	bpf_prog_put(prog);
+	return 0;
+}
+
+static const struct file_operations bpf_tracing_prog_fops = {
+	.release	= bpf_tracing_prog_release,
+	.read		= bpf_dummy_read,
+	.write		= bpf_dummy_write,
+};
+
+static int bpf_tracing_prog_attach(struct bpf_prog *prog)
+{
+	int tr_fd, err;
+
+	if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
+	    prog->expected_attach_type != BPF_TRACE_FEXIT) {
+		err = -EINVAL;
+		goto out_put_prog;
+	}
+
+	err = bpf_trampoline_link_prog(prog);
+	if (err)
+		goto out_put_prog;
+
+	tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops,
+				 prog, O_CLOEXEC);
+	if (tr_fd < 0) {
+		WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog));
+		err = tr_fd;
+		goto out_put_prog;
+	}
+	return tr_fd;
+
+out_put_prog:
+	bpf_prog_put(prog);
+	return err;
+}
+
 struct bpf_raw_tracepoint {
 	struct bpf_raw_event_map *btp;
 	struct bpf_prog *prog;
@@ -1850,14 +1893,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 
 	if (prog->type == BPF_PROG_TYPE_TRACING) {
 		if (attr->raw_tracepoint.name) {
-			/* raw_tp name should not be specified in raw_tp
-			 * programs that were verified via in-kernel BTF info
+			/* The attach point for this category of programs
+			 * should be specified via btf_id during program load.
 			 */
 			err = -EINVAL;
 			goto out_put_prog;
 		}
-		/* raw_tp name is taken from type name instead */
-		tp_name = prog->aux->attach_func_name;
+		if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
+			tp_name = prog->aux->attach_func_name;
+		else
+			return bpf_tracing_prog_attach(prog);
 	} else {
 		if (strncpy_from_user(buf,
 				      u64_to_user_ptr(attr->raw_tracepoint.name),
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
new file mode 100644
index 000000000000..10ae59d65f13
--- /dev/null
+++ b/kernel/bpf/trampoline.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2019 Facebook */
+#include <linux/hash.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
+#define TRAMPOLINE_HASH_BITS 10
+#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)
+
+static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
+
+/* serializes access to trampoline_table */
+static DEFINE_MUTEX(trampoline_mutex);
+
+struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
+{
+	struct bpf_trampoline *tr;
+	struct hlist_head *head;
+	void *image;
+	int i;
+
+	mutex_lock(&trampoline_mutex);
+	head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
+	hlist_for_each_entry(tr, head, hlist) {
+		if (tr->key == key) {
+			refcount_inc(&tr->refcnt);
+			goto out;
+		}
+	}
+	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
+	if (!tr)
+		goto out;
+
+	/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
+	image = bpf_jit_alloc_exec(PAGE_SIZE);
+	if (!image) {
+		kfree(tr);
+		tr = NULL;
+		goto out;
+	}
+
+	tr->key = key;
+	INIT_HLIST_NODE(&tr->hlist);
+	hlist_add_head(&tr->hlist, head);
+	refcount_set(&tr->refcnt, 1);
+	mutex_init(&tr->mutex);
+	for (i = 0; i < BPF_TRAMP_MAX; i++)
+		INIT_HLIST_HEAD(&tr->progs_hlist[i]);
+
+	set_vm_flush_reset_perms(image);
+	/* Keep image as writeable. The alternative is to keep flipping ro/rw
+	 * everytime new program is attached or detached.
+	 */
+	set_memory_x((long)image, 1);
+	tr->image = image;
+out:
+	mutex_unlock(&trampoline_mutex);
+	return tr;
+}
+
+/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
+ * bytes on x86.  Pick a number to fit into PAGE_SIZE / 2
+ */
+#define BPF_MAX_TRAMP_PROGS 40
+
+static int bpf_trampoline_update(struct bpf_trampoline *tr)
+{
+	void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
+	void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
+	struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS];
+	int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY];
+	int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT];
+	struct bpf_prog **progs, **fentry, **fexit;
+	u32 flags = BPF_TRAMP_F_RESTORE_REGS;
+	struct bpf_prog_aux *aux;
+	int err;
+
+	if (fentry_cnt + fexit_cnt == 0) {
+		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_NOP,
+					 old_image, NULL);
+		tr->selector = 0;
+		goto out;
+	}
+
+	/* populate fentry progs */
+	fentry = progs = progs_to_run;
+	hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist)
+		*progs++ = aux->prog;
+
+	/* populate fexit progs */
+	fexit = progs;
+	hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist)
+		*progs++ = aux->prog;
+
+	if (fexit_cnt)
+		flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+
+	err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags,
+					  fentry, fentry_cnt,
+					  fexit, fexit_cnt,
+					  tr->func.addr);
+	if (err)
+		goto out;
+
+	if (tr->selector)
+		/* progs already running at this address */
+		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_CALL,
+					 old_image, new_image);
+	else
+		/* first time registering */
+		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP_TO_CALL,
+					 NULL, new_image);
+	if (err)
+		goto out;
+	tr->selector++;
+out:
+	return err;
+}
+
+static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t)
+{
+	switch (t) {
+	case BPF_TRACE_FENTRY:
+		return BPF_TRAMP_FENTRY;
+	default:
+		return BPF_TRAMP_FEXIT;
+	}
+}
+
+int bpf_trampoline_link_prog(struct bpf_prog *prog)
+{
+	enum bpf_tramp_prog_type kind;
+	struct bpf_trampoline *tr;
+	int err = 0;
+
+	tr = prog->aux->trampoline;
+	kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
+	mutex_lock(&tr->mutex);
+	if (tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]
+	    >= BPF_MAX_TRAMP_PROGS) {
+		err = -E2BIG;
+		goto out;
+	}
+	if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
+		/* prog already linked */
+		err = -EBUSY;
+		goto out;
+	}
+	hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
+	tr->progs_cnt[kind]++;
+	err = bpf_trampoline_update(prog->aux->trampoline);
+	if (err) {
+		hlist_del(&prog->aux->tramp_hlist);
+		tr->progs_cnt[kind]--;
+	}
+out:
+	mutex_unlock(&tr->mutex);
+	return err;
+}
+
+/* bpf_trampoline_unlink_prog() should never fail. */
+int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
+{
+	enum bpf_tramp_prog_type kind;
+	struct bpf_trampoline *tr;
+	int err;
+
+	tr = prog->aux->trampoline;
+	kind = bpf_attach_type_to_tramp(prog->expected_attach_type);
+	mutex_lock(&tr->mutex);
+	hlist_del(&prog->aux->tramp_hlist);
+	tr->progs_cnt[kind]--;
+	err = bpf_trampoline_update(prog->aux->trampoline);
+	mutex_unlock(&tr->mutex);
+	return err;
+}
+
+void bpf_trampoline_put(struct bpf_trampoline *tr)
+{
+	if (!tr)
+		return;
+	mutex_lock(&trampoline_mutex);
+	if (!refcount_dec_and_test(&tr->refcnt))
+		goto out;
+	WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
+	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
+		goto out;
+	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
+		goto out;
+	bpf_jit_free_exec(tr->image);
+	hlist_del(&tr->hlist);
+	kfree(tr);
+out:
+	mutex_unlock(&trampoline_mutex);
+}
+
+/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that
+ * are needed for trampoline. The macro is split into
+ * call _bpf_prog_enter
+ * call prog->bpf_func
+ * call __bpf_prog_exit
+ */
+u64 notrace __bpf_prog_enter(void)
+{
+	u64 start = 0;
+
+	rcu_read_lock();
+	preempt_disable();
+	if (static_branch_unlikely(&bpf_stats_enabled_key))
+		start = sched_clock();
+	return start;
+}
+
+void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
+{
+	struct bpf_prog_stats *stats;
+
+	if (static_branch_unlikely(&bpf_stats_enabled_key) &&
+	    /* static_key could be enabled in __bpf_prog_enter
+	     * and disabled in __bpf_prog_exit.
+	     * And vice versa.
+	     * Hence check that 'start' is not zero.
+	     */
+	    start) {
+		stats = this_cpu_ptr(prog->aux->stats);
+		u64_stats_update_begin(&stats->syncp);
+		stats->cnt++;
+		stats->nsecs += sched_clock() - start;
+		u64_stats_update_end(&stats->syncp);
+	}
+	preempt_enable();
+	rcu_read_unlock();
+}
+
+int __weak
+arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
+			    struct bpf_prog **fentry_progs, int fentry_cnt,
+			    struct bpf_prog **fexit_progs, int fexit_cnt,
+			    void *orig_call)
+{
+	return -ENOTSUPP;
+}
+
+static int __init init_trampolines(void)
+{
+	int i;
+
+	for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
+		INIT_HLIST_HEAD(&trampoline_table[i]);
+	return 0;
+}
+late_initcall(init_trampolines);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2f2374967b36..8f89cfa93e88 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9382,8 +9382,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	struct bpf_prog *prog = env->prog;
 	u32 btf_id = prog->aux->attach_btf_id;
 	const char prefix[] = "btf_trace_";
+	struct bpf_trampoline *tr;
 	const struct btf_type *t;
 	const char *tname;
+	int ret = 0;
+	long addr;
 
 	if (prog->type != BPF_PROG_TYPE_TRACING)
 		return 0;
@@ -9432,6 +9435,45 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 		prog->aux->attach_func_proto = t;
 		prog->aux->attach_btf_trace = true;
 		return 0;
+	case BPF_TRACE_FENTRY:
+	case BPF_TRACE_FEXIT:
+		if (!btf_type_is_func(t)) {
+			verbose(env, "attach_btf_id %u is not a function\n",
+				btf_id);
+			return -EINVAL;
+		}
+		t = btf_type_by_id(btf_vmlinux, t->type);
+		if (!btf_type_is_func_proto(t))
+			return -EINVAL;
+		tr = bpf_trampoline_lookup(btf_id);
+		if (!tr)
+			return -ENOMEM;
+		prog->aux->attach_func_name = tname;
+		prog->aux->attach_func_proto = t;
+		mutex_lock(&tr->mutex);
+		if (tr->func.addr) {
+			prog->aux->trampoline = tr;
+			goto out;
+		}
+		ret = btf_distill_func_proto(&env->log, btf_vmlinux, t,
+					     tname, &tr->func.model);
+		if (ret < 0)
+			goto out;
+		addr = kallsyms_lookup_name(tname);
+		if (!addr) {
+			verbose(env,
+				"The address of function %s cannot be found\n",
+				tname);
+			ret = -ENOENT;
+			goto out;
+		}
+		tr->func.addr = (void *)addr;
+		prog->aux->trampoline = tr;
+out:
+		mutex_unlock(&tr->mutex);
+		if (ret)
+			bpf_trampoline_put(tr);
+		return ret;
 	default:
 		return -EINVAL;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 9cc31b3a092d9bf2a18f09ad77e727ddb42a5b1e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 14 Nov 2019 10:57:14 -0800
Subject: bpf: Fix race in btf_resolve_helper_id()

btf_resolve_helper_id() caching logic is a bit racy, since under root the
verifier can verify several programs in parallel. Fix it with READ/WRITE_ONCE.
Fix the type as well, since error is also recorded.

Fixes: a7658e1a4164 ("bpf: Check types of arguments passed into helpers")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20191114185720.1641606-15-ast@kernel.org
---
 include/linux/bpf.h   |  5 +++--
 kernel/bpf/btf.c      | 26 +++++++++++++++++++++++++-
 kernel/bpf/verifier.c |  8 +++-----
 net/core/filter.c     |  2 +-
 4 files changed, 32 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0d4c5c224d79..cb5a356381f5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -248,7 +248,7 @@ struct bpf_func_proto {
 		};
 		enum bpf_arg_type arg_type[5];
 	};
-	u32 *btf_id; /* BTF ids of arguments */
+	int *btf_id; /* BTF ids of arguments */
 };
 
 /* bpf_context is intentionally undefined structure. Pointer to bpf_context is
@@ -881,7 +881,8 @@ int btf_struct_access(struct bpf_verifier_log *log,
 		      const struct btf_type *t, int off, int size,
 		      enum bpf_access_type atype,
 		      u32 *next_btf_id);
-u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int);
+int btf_resolve_helper_id(struct bpf_verifier_log *log,
+			  const struct bpf_func_proto *fn, int);
 
 int btf_distill_func_proto(struct bpf_verifier_log *log,
 			   struct btf *btf,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9e1164e5b429..033d071eb59c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3721,7 +3721,8 @@ again:
 	return -EINVAL;
 }
 
-u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg)
+static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn,
+				   int arg)
 {
 	char fnname[KSYM_SYMBOL_LEN + 4] = "btf_";
 	const struct btf_param *args;
@@ -3789,6 +3790,29 @@ u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg)
 	return btf_id;
 }
 
+int btf_resolve_helper_id(struct bpf_verifier_log *log,
+			  const struct bpf_func_proto *fn, int arg)
+{
+	int *btf_id = &fn->btf_id[arg];
+	int ret;
+
+	if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID)
+		return -EINVAL;
+
+	ret = READ_ONCE(*btf_id);
+	if (ret)
+		return ret;
+	/* ok to race the search. The result is the same */
+	ret = __btf_resolve_helper_id(log, fn->func, arg);
+	if (!ret) {
+		/* Function argument cannot be type 'void' */
+		bpf_log(log, "BTF resolution bug\n");
+		return -EFAULT;
+	}
+	WRITE_ONCE(*btf_id, ret);
+	return ret;
+}
+
 static int __get_type_size(struct btf *btf, u32 btf_id,
 			   const struct btf_type **bad_type)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8f89cfa93e88..e78ec7990767 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4147,11 +4147,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	meta.func_id = func_id;
 	/* check args */
 	for (i = 0; i < 5; i++) {
-		if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID) {
-			if (!fn->btf_id[i])
-				fn->btf_id[i] = btf_resolve_helper_id(&env->log, fn->func, i);
-			meta.btf_id = fn->btf_id[i];
-		}
+		err = btf_resolve_helper_id(&env->log, fn, i);
+		if (err > 0)
+			meta.btf_id = err;
 		err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta);
 		if (err)
 			return err;
diff --git a/net/core/filter.c b/net/core/filter.c
index fc303abec8fa..f72face90659 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3816,7 +3816,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
-static u32 bpf_skb_output_btf_ids[5];
+static int bpf_skb_output_btf_ids[5];
 const struct bpf_func_proto bpf_skb_output_proto = {
 	.func		= bpf_skb_event_output,
 	.gpl_only	= true,
-- 
cgit v1.2.3-59-g8ed1b


From 5c27d8d76ce810c6254cf5917a6019d824f34bd2 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 13 Nov 2019 14:08:01 +0100
Subject: netfilter: nf_flow_table_offload: add IPv6 support

Add nf_flow_rule_route_ipv6() and use it from the IPv6 and the inet
flowtable type definitions. Rename the nf_flow_rule_route() function to
nf_flow_rule_route_ipv4().

Adjust maximum number of actions, which now becomes 16 to leave
sufficient room for the IPv6 address mangling for NAT.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h   |   9 ++-
 net/ipv4/netfilter/nf_flow_table_ipv4.c |   2 +-
 net/ipv6/netfilter/nf_flow_table_ipv6.c |   2 +-
 net/netfilter/nf_flow_table_inet.c      |  25 +++++++-
 net/netfilter/nf_flow_table_offload.c   | 100 ++++++++++++++++++++++++++++++--
 5 files changed, 127 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index eea66de328d3..f0897b3c97fb 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -163,9 +163,12 @@ void nf_flow_table_offload_flush(struct nf_flowtable *flowtable);
 int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
 				struct net_device *dev,
 				enum flow_block_command cmd);
-int nf_flow_rule_route(struct net *net, const struct flow_offload *flow,
-		       enum flow_offload_tuple_dir dir,
-		       struct nf_flow_rule *flow_rule);
+int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
+			    enum flow_offload_tuple_dir dir,
+			    struct nf_flow_rule *flow_rule);
+int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
+			    enum flow_offload_tuple_dir dir,
+			    struct nf_flow_rule *flow_rule);
 
 int nf_flow_table_offload_init(void);
 void nf_flow_table_offload_exit(void);
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 168b72e18be0..e32e41b99f0f 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -10,7 +10,7 @@ static struct nf_flowtable_type flowtable_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.init		= nf_flow_table_init,
 	.setup		= nf_flow_table_offload_setup,
-	.action		= nf_flow_rule_route,
+	.action		= nf_flow_rule_route_ipv4,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ip_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index f069bc0dc056..a8566ee12e83 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -11,7 +11,7 @@ static struct nf_flowtable_type flowtable_ipv6 = {
 	.family		= NFPROTO_IPV6,
 	.init		= nf_flow_table_init,
 	.setup		= nf_flow_table_offload_setup,
-	.action		= nf_flow_rule_route,
+	.action		= nf_flow_rule_route_ipv6,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ipv6_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index bfb910b874ce..88bedf1ff1ae 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -21,11 +21,34 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
 	return NF_ACCEPT;
 }
 
+static int nf_flow_rule_route_inet(struct net *net,
+				   const struct flow_offload *flow,
+				   enum flow_offload_tuple_dir dir,
+				   struct nf_flow_rule *flow_rule)
+{
+	const struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
+	int err;
+
+	switch (flow_tuple->l3proto) {
+	case NFPROTO_IPV4:
+		err = nf_flow_rule_route_ipv4(net, flow, dir, flow_rule);
+		break;
+	case NFPROTO_IPV6:
+		err = nf_flow_rule_route_ipv6(net, flow, dir, flow_rule);
+		break;
+	default:
+		err = -1;
+		break;
+	}
+
+	return err;
+}
+
 static struct nf_flowtable_type flowtable_inet = {
 	.family		= NFPROTO_INET,
 	.init		= nf_flow_table_init,
 	.setup		= nf_flow_table_offload_setup,
-	.action		= nf_flow_rule_route,
+	.action		= nf_flow_rule_route_inet,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_inet_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index b9f669c80713..a14932748bcf 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -236,6 +236,71 @@ static void flow_offload_ipv4_dnat(struct net *net,
 			    (u8 *)&addr, (u8 *)&mask);
 }
 
+static void flow_offload_ipv6_mangle(struct nf_flow_rule *flow_rule,
+				     unsigned int offset,
+				     u8 *addr, u8 *mask)
+{
+	struct flow_action_entry *entry;
+	int i;
+
+	for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i += sizeof(u32)) {
+		entry = flow_action_entry_next(flow_rule);
+		flow_offload_mangle(entry, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
+				    offset + i,
+				    &addr[i], mask);
+	}
+}
+
+static void flow_offload_ipv6_snat(struct net *net,
+				   const struct flow_offload *flow,
+				   enum flow_offload_tuple_dir dir,
+				   struct nf_flow_rule *flow_rule)
+{
+	u32 mask = ~htonl(0xffffffff);
+	const u8 *addr;
+	u32 offset;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6.s6_addr;
+		offset = offsetof(struct ipv6hdr, saddr);
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6.s6_addr;
+		offset = offsetof(struct ipv6hdr, daddr);
+		break;
+	default:
+		return;
+	}
+
+	flow_offload_ipv6_mangle(flow_rule, offset, (u8 *)addr, (u8 *)&mask);
+}
+
+static void flow_offload_ipv6_dnat(struct net *net,
+				   const struct flow_offload *flow,
+				   enum flow_offload_tuple_dir dir,
+				   struct nf_flow_rule *flow_rule)
+{
+	u32 mask = ~htonl(0xffffffff);
+	const u8 *addr;
+	u32 offset;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6.s6_addr;
+		offset = offsetof(struct ipv6hdr, daddr);
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6.s6_addr;
+		offset = offsetof(struct ipv6hdr, saddr);
+		break;
+	default:
+		return;
+	}
+
+	flow_offload_ipv6_mangle(flow_rule, offset, (u8 *)addr, (u8 *)&mask);
+}
+
 static int flow_offload_l4proto(const struct flow_offload *flow)
 {
 	u8 protonum = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto;
@@ -342,9 +407,9 @@ static void flow_offload_redirect(const struct flow_offload *flow,
 	dev_hold(rt->dst.dev);
 }
 
-int nf_flow_rule_route(struct net *net, const struct flow_offload *flow,
-		       enum flow_offload_tuple_dir dir,
-		       struct nf_flow_rule *flow_rule)
+int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
+			    enum flow_offload_tuple_dir dir,
+			    struct nf_flow_rule *flow_rule)
 {
 	if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
 	    flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
@@ -366,7 +431,32 @@ int nf_flow_rule_route(struct net *net, const struct flow_offload *flow,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(nf_flow_rule_route);
+EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4);
+
+int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
+			    enum flow_offload_tuple_dir dir,
+			    struct nf_flow_rule *flow_rule)
+{
+	if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
+	    flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
+		return -1;
+
+	if (flow->flags & FLOW_OFFLOAD_SNAT) {
+		flow_offload_ipv6_snat(net, flow, dir, flow_rule);
+		flow_offload_port_snat(net, flow, dir, flow_rule);
+	}
+	if (flow->flags & FLOW_OFFLOAD_DNAT) {
+		flow_offload_ipv6_dnat(net, flow, dir, flow_rule);
+		flow_offload_port_dnat(net, flow, dir, flow_rule);
+	}
+
+	flow_offload_redirect(flow, dir, flow_rule);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv6);
+
+#define NF_FLOW_RULE_ACTION_MAX	16
 
 static struct nf_flow_rule *
 nf_flow_offload_rule_alloc(struct net *net,
@@ -383,7 +473,7 @@ nf_flow_offload_rule_alloc(struct net *net,
 	if (!flow_rule)
 		goto err_flow;
 
-	flow_rule->rule = flow_rule_alloc(10);
+	flow_rule->rule = flow_rule_alloc(NF_FLOW_RULE_ACTION_MAX);
 	if (!flow_rule->rule)
 		goto err_flow_rule;
 
-- 
cgit v1.2.3-59-g8ed1b


From 91cc1a99740e2ed1d903b5906afb470cc5a07379 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 14 Nov 2019 10:57:15 -0800
Subject: bpf: Annotate context types

Annotate BPF program context types with program-side type and kernel-side type.
This type information is used by the verifier. btf_get_prog_ctx_type() is
used in the later patches to verify that BTF type of ctx in BPF program matches to
kernel expected ctx type. For example, the XDP program type is:
BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp, struct xdp_md, struct xdp_buff)
That means that XDP program should be written as:
int xdp_prog(struct xdp_md *ctx) { ... }

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20191114185720.1641606-16-ast@kernel.org
---
 include/linux/bpf.h       |  11 ++++-
 include/linux/bpf_types.h |  78 ++++++++++++++++++++-----------
 kernel/bpf/btf.c          | 114 ++++++++++++++++++++++++++++++++++++++++++++--
 kernel/bpf/syscall.c      |   4 +-
 kernel/bpf/verifier.c     |   2 +-
 net/core/filter.c         |  10 ----
 6 files changed, 176 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cb5a356381f5..9c48f11fe56e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -747,7 +747,7 @@ DECLARE_PER_CPU(int, bpf_prog_active);
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
 
-#define BPF_PROG_TYPE(_id, _name) \
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
 	extern const struct bpf_prog_ops _name ## _prog_ops; \
 	extern const struct bpf_verifier_ops _name ## _verifier_ops;
 #define BPF_MAP_TYPE(_id, _ops) \
@@ -1213,6 +1213,15 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
 #endif
 
 #ifdef CONFIG_INET
+struct sk_reuseport_kern {
+	struct sk_buff *skb;
+	struct sock *sk;
+	struct sock *selected_sk;
+	void *data_end;
+	u32 hash;
+	u32 reuseport_id;
+	bool bind_inany;
+};
 bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
 				  struct bpf_insn_access_aux *info);
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index de14872b01ba..93740b3614d7 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -2,42 +2,68 @@
 /* internal file - do not include directly */
 
 #ifdef CONFIG_NET
-BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act)
-BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter,
+	      struct __sk_buff, struct sk_buff)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act,
+	      struct __sk_buff, struct sk_buff)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act,
+	      struct __sk_buff, struct sk_buff)
+BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp,
+	      struct xdp_md, struct xdp_buff)
 #ifdef CONFIG_CGROUP_BPF
-BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb)
-BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock)
-BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb,
+	      struct __sk_buff, struct sk_buff)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock,
+	      struct bpf_sock, struct sock)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr,
+	      struct bpf_sock_addr, struct bpf_sock_addr_kern)
 #endif
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
-BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in,
+	      struct __sk_buff, struct sk_buff)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out,
+	      struct __sk_buff, struct sk_buff)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit,
+	      struct __sk_buff, struct sk_buff)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local,
+	      struct __sk_buff, struct sk_buff)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops,
+	      struct bpf_sock_ops, struct bpf_sock_ops_kern)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb,
+	      struct __sk_buff, struct sk_buff)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg,
+	      struct sk_msg_md, struct sk_msg)
+BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector,
+	      struct __sk_buff, struct bpf_flow_dissector)
 #endif
 #ifdef CONFIG_BPF_EVENTS
-BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
-BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
-BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
-BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
-BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable)
-BPF_PROG_TYPE(BPF_PROG_TYPE_TRACING, tracing)
+BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe,
+	      bpf_user_pt_regs_t, struct pt_regs)
+BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint,
+	      __u64, u64)
+BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event,
+	      struct bpf_perf_event_data, struct bpf_perf_event_data_kern)
+BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint,
+	      struct bpf_raw_tracepoint_args, u64)
+BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable,
+	      struct bpf_raw_tracepoint_args, u64)
+BPF_PROG_TYPE(BPF_PROG_TYPE_TRACING, tracing,
+	      void *, void *)
 #endif
 #ifdef CONFIG_CGROUP_BPF
-BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
-BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl)
-BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev,
+	      struct bpf_cgroup_dev_ctx, struct bpf_cgroup_dev_ctx)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl,
+	      struct bpf_sysctl, struct bpf_sysctl_kern)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt,
+	      struct bpf_sockopt, struct bpf_sockopt_kern)
 #endif
 #ifdef CONFIG_BPF_LIRC_MODE2
-BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
+	      __u32, u32)
 #endif
 #ifdef CONFIG_INET
-BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport,
+	      struct sk_reuseport_md, struct sk_reuseport_kern)
 #endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 033d071eb59c..4b7c8bd423d6 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2,6 +2,8 @@
 /* Copyright (c) 2018 Facebook */
 
 #include <uapi/linux/btf.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/bpf_perf_event.h>
 #include <uapi/linux/types.h>
 #include <linux/seq_file.h>
 #include <linux/compiler.h>
@@ -16,6 +18,9 @@
 #include <linux/sort.h>
 #include <linux/bpf_verifier.h>
 #include <linux/btf.h>
+#include <linux/skmsg.h>
+#include <linux/perf_event.h>
+#include <net/sock.h>
 
 /* BTF (BPF Type Format) is the meta data format which describes
  * the data types of BPF program/map.  Hence, it basically focus
@@ -3439,13 +3444,98 @@ errout:
 
 extern char __weak _binary__btf_vmlinux_bin_start[];
 extern char __weak _binary__btf_vmlinux_bin_end[];
+extern struct btf *btf_vmlinux;
+
+#define BPF_MAP_TYPE(_id, _ops)
+static union {
+	struct bpf_ctx_convert {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
+	prog_ctx_type _id##_prog; \
+	kern_ctx_type _id##_kern;
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+	} *__t;
+	/* 't' is written once under lock. Read many times. */
+	const struct btf_type *t;
+} bpf_ctx_convert;
+enum {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
+	__ctx_convert##_id,
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+};
+static u8 bpf_ctx_convert_map[] = {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
+	[_id] = __ctx_convert##_id,
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+};
+#undef BPF_MAP_TYPE
+
+static const struct btf_member *
+btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
+		      const struct btf_type *t, enum bpf_prog_type prog_type)
+{
+	const struct btf_type *conv_struct;
+	const struct btf_type *ctx_struct;
+	const struct btf_member *ctx_type;
+	const char *tname, *ctx_tname;
+
+	conv_struct = bpf_ctx_convert.t;
+	if (!conv_struct) {
+		bpf_log(log, "btf_vmlinux is malformed\n");
+		return NULL;
+	}
+	t = btf_type_by_id(btf, t->type);
+	while (btf_type_is_modifier(t))
+		t = btf_type_by_id(btf, t->type);
+	if (!btf_type_is_struct(t)) {
+		/* Only pointer to struct is supported for now.
+		 * That means that BPF_PROG_TYPE_TRACEPOINT with BTF
+		 * is not supported yet.
+		 * BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
+		 */
+		bpf_log(log, "BPF program ctx type is not a struct\n");
+		return NULL;
+	}
+	tname = btf_name_by_offset(btf, t->name_off);
+	if (!tname) {
+		bpf_log(log, "BPF program ctx struct doesn't have a name\n");
+		return NULL;
+	}
+	/* prog_type is valid bpf program type. No need for bounds check. */
+	ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2;
+	/* ctx_struct is a pointer to prog_ctx_type in vmlinux.
+	 * Like 'struct __sk_buff'
+	 */
+	ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type);
+	if (!ctx_struct)
+		/* should not happen */
+		return NULL;
+	ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);
+	if (!ctx_tname) {
+		/* should not happen */
+		bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n");
+		return NULL;
+	}
+	/* only compare that prog's ctx type name is the same as
+	 * kernel expects. No need to compare field by field.
+	 * It's ok for bpf prog to do:
+	 * struct __sk_buff {};
+	 * int socket_filter_bpf_prog(struct __sk_buff *skb)
+	 * { // no fields of skb are ever used }
+	 */
+	if (strcmp(ctx_tname, tname))
+		return NULL;
+	return ctx_type;
+}
 
 struct btf *btf_parse_vmlinux(void)
 {
 	struct btf_verifier_env *env = NULL;
 	struct bpf_verifier_log *log;
 	struct btf *btf = NULL;
-	int err;
+	int err, i;
 
 	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
 	if (!env)
@@ -3479,6 +3569,26 @@ struct btf *btf_parse_vmlinux(void)
 	if (err)
 		goto errout;
 
+	/* find struct bpf_ctx_convert for type checking later */
+	for (i = 1; i <= btf->nr_types; i++) {
+		const struct btf_type *t;
+		const char *tname;
+
+		t = btf_type_by_id(btf, i);
+		if (!__btf_type_is_struct(t))
+			continue;
+		tname = __btf_name_by_offset(btf, t->name_off);
+		if (!strcmp(tname, "bpf_ctx_convert")) {
+			/* btf_parse_vmlinux() runs under bpf_verifier_lock */
+			bpf_ctx_convert.t = t;
+			break;
+		}
+	}
+	if (i > btf->nr_types) {
+		err = -ENOENT;
+		goto errout;
+	}
+
 	btf_verifier_env_free(env);
 	refcount_set(&btf->refcnt, 1);
 	return btf;
@@ -3492,8 +3602,6 @@ errout:
 	return ERR_PTR(err);
 }
 
-extern struct btf *btf_vmlinux;
-
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e2e37bea86bc..05a0ee75eca0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -43,7 +43,7 @@ static DEFINE_SPINLOCK(map_idr_lock);
 int sysctl_unprivileged_bpf_disabled __read_mostly;
 
 static const struct bpf_map_ops * const bpf_map_types[] = {
-#define BPF_PROG_TYPE(_id, _ops)
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
 #define BPF_MAP_TYPE(_id, _ops) \
 	[_id] = &_ops,
 #include <linux/bpf_types.h>
@@ -1189,7 +1189,7 @@ err_put:
 }
 
 static const struct bpf_prog_ops * const bpf_prog_types[] = {
-#define BPF_PROG_TYPE(_id, _name) \
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
 	[_id] = & _name ## _prog_ops,
 #define BPF_MAP_TYPE(_id, _ops)
 #include <linux/bpf_types.h>
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e78ec7990767..7395d6bebefd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23,7 +23,7 @@
 #include "disasm.h"
 
 static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
-#define BPF_PROG_TYPE(_id, _name) \
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
 	[_id] = & _name ## _verifier_ops,
 #define BPF_MAP_TYPE(_id, _ops)
 #include <linux/bpf_types.h>
diff --git a/net/core/filter.c b/net/core/filter.c
index f72face90659..49ded4a7588a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8684,16 +8684,6 @@ out:
 }
 
 #ifdef CONFIG_INET
-struct sk_reuseport_kern {
-	struct sk_buff *skb;
-	struct sock *sk;
-	struct sock *selected_sk;
-	void *data_end;
-	u32 hash;
-	u32 reuseport_id;
-	bool bind_inany;
-};
-
 static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
 				    struct sock_reuseport *reuse,
 				    struct sock *sk, struct sk_buff *skb,
-- 
cgit v1.2.3-59-g8ed1b


From 8c1b6e69dcc1e11bd24111e3734dd740aaf3fda1 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 14 Nov 2019 10:57:16 -0800
Subject: bpf: Compare BTF types of functions arguments with actual types

Make the verifier check that BTF types of function arguments match actual types
passed into top-level BPF program and into BPF-to-BPF calls. If types match
such BPF programs and sub-programs will have full support of BPF trampoline. If
types mismatch the trampoline has to be conservative. It has to save/restore
five program arguments and assume 64-bit scalars.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/20191114185720.1641606-17-ast@kernel.org
---
 include/linux/bpf.h          |  8 +++++
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/btf.c             | 82 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c         |  1 +
 kernel/bpf/verifier.c        | 18 ++++++++--
 5 files changed, 107 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9c48f11fe56e..c70bf04726b4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -480,6 +480,10 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
 static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
 #endif
 
+struct bpf_func_info_aux {
+	bool unreliable;
+};
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -494,6 +498,7 @@ struct bpf_prog_aux {
 	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
 	bool offload_requested;
 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
+	bool func_proto_unreliable;
 	enum bpf_tramp_prog_type trampoline_prog_type;
 	struct bpf_trampoline *trampoline;
 	struct hlist_node tramp_hlist;
@@ -518,6 +523,7 @@ struct bpf_prog_aux {
 	struct bpf_prog_offload *offload;
 	struct btf *btf;
 	struct bpf_func_info *func_info;
+	struct bpf_func_info_aux *func_info_aux;
 	/* bpf_line_info loaded from userspace.  linfo->insn_off
 	 * has the xlated insn offset.
 	 * Both the main and sub prog share the same linfo.
@@ -890,6 +896,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 			   const char *func_name,
 			   struct btf_func_model *m);
 
+int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog);
+
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 6e7284ea1468..cdd08bf0ec06 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -343,6 +343,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 #define BPF_MAX_SUBPROGS 256
 
 struct bpf_subprog_info {
+	/* 'start' has to be the first field otherwise find_subprog() won't work */
 	u32 start; /* insn idx of function entry point */
 	u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
 	u16 stack_depth; /* max. stack depth used by this function */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4b7c8bd423d6..4620267b186e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3985,6 +3985,88 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 	return 0;
 }
 
+int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog)
+{
+	struct bpf_verifier_state *st = env->cur_state;
+	struct bpf_func_state *func = st->frame[st->curframe];
+	struct bpf_reg_state *reg = func->regs;
+	struct bpf_verifier_log *log = &env->log;
+	struct bpf_prog *prog = env->prog;
+	struct btf *btf = prog->aux->btf;
+	const struct btf_param *args;
+	const struct btf_type *t;
+	u32 i, nargs, btf_id;
+	const char *tname;
+
+	if (!prog->aux->func_info)
+		return 0;
+
+	btf_id = prog->aux->func_info[subprog].type_id;
+	if (!btf_id)
+		return 0;
+
+	if (prog->aux->func_info_aux[subprog].unreliable)
+		return 0;
+
+	t = btf_type_by_id(btf, btf_id);
+	if (!t || !btf_type_is_func(t)) {
+		bpf_log(log, "BTF of subprog %d doesn't point to KIND_FUNC\n",
+			subprog);
+		return -EINVAL;
+	}
+	tname = btf_name_by_offset(btf, t->name_off);
+
+	t = btf_type_by_id(btf, t->type);
+	if (!t || !btf_type_is_func_proto(t)) {
+		bpf_log(log, "Invalid type of func %s\n", tname);
+		return -EINVAL;
+	}
+	args = (const struct btf_param *)(t + 1);
+	nargs = btf_type_vlen(t);
+	if (nargs > 5) {
+		bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs);
+		goto out;
+	}
+	/* check that BTF function arguments match actual types that the
+	 * verifier sees.
+	 */
+	for (i = 0; i < nargs; i++) {
+		t = btf_type_by_id(btf, args[i].type);
+		while (btf_type_is_modifier(t))
+			t = btf_type_by_id(btf, t->type);
+		if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+			if (reg[i + 1].type == SCALAR_VALUE)
+				continue;
+			bpf_log(log, "R%d is not a scalar\n", i + 1);
+			goto out;
+		}
+		if (btf_type_is_ptr(t)) {
+			if (reg[i + 1].type == SCALAR_VALUE) {
+				bpf_log(log, "R%d is not a pointer\n", i + 1);
+				goto out;
+			}
+			/* If program is passing PTR_TO_CTX into subprogram
+			 * check that BTF type matches.
+			 */
+			if (reg[i + 1].type == PTR_TO_CTX &&
+			    !btf_get_prog_ctx_type(log, btf, t, prog->type))
+				goto out;
+			/* All other pointers are ok */
+			continue;
+		}
+		bpf_log(log, "Unrecognized argument type %s\n",
+			btf_kind_str[BTF_INFO_KIND(t->info)]);
+		goto out;
+	}
+	return 0;
+out:
+	/* LLVM optimizations can remove arguments from static functions. */
+	bpf_log(log,
+		"Type info disagrees with actual arguments due to compiler optimizations\n");
+	prog->aux->func_info_aux[subprog].unreliable = true;
+	return 0;
+}
+
 void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 		       struct seq_file *m)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 05a0ee75eca0..43ba647de720 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1328,6 +1328,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
 
 	kvfree(aux->func_info);
+	kfree(aux->func_info_aux);
 	free_used_maps(aux);
 	bpf_prog_uncharge_memlock(aux->prog);
 	security_bpf_prog_free(aux);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7395d6bebefd..11910149ca2f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3970,6 +3970,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	/* only increment it after check_reg_arg() finished */
 	state->curframe++;
 
+	if (btf_check_func_arg_match(env, subprog))
+		return -EINVAL;
+
 	/* and go analyze first insn of the callee */
 	*insn_idx = target_insn;
 
@@ -6564,6 +6567,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
 	u32 i, nfuncs, urec_size, min_size;
 	u32 krec_size = sizeof(struct bpf_func_info);
 	struct bpf_func_info *krecord;
+	struct bpf_func_info_aux *info_aux = NULL;
 	const struct btf_type *type;
 	struct bpf_prog *prog;
 	const struct btf *btf;
@@ -6597,6 +6601,9 @@ static int check_btf_func(struct bpf_verifier_env *env,
 	krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
 	if (!krecord)
 		return -ENOMEM;
+	info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
+	if (!info_aux)
+		goto err_free;
 
 	for (i = 0; i < nfuncs; i++) {
 		ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size);
@@ -6648,29 +6655,31 @@ static int check_btf_func(struct bpf_verifier_env *env,
 			ret = -EINVAL;
 			goto err_free;
 		}
-
 		prev_offset = krecord[i].insn_off;
 		urecord += urec_size;
 	}
 
 	prog->aux->func_info = krecord;
 	prog->aux->func_info_cnt = nfuncs;
+	prog->aux->func_info_aux = info_aux;
 	return 0;
 
 err_free:
 	kvfree(krecord);
+	kfree(info_aux);
 	return ret;
 }
 
 static void adjust_btf_func(struct bpf_verifier_env *env)
 {
+	struct bpf_prog_aux *aux = env->prog->aux;
 	int i;
 
-	if (!env->prog->aux->func_info)
+	if (!aux->func_info)
 		return;
 
 	for (i = 0; i < env->subprog_cnt; i++)
-		env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start;
+		aux->func_info[i].insn_off = env->subprog_info[i].start;
 }
 
 #define MIN_BPF_LINEINFO_SIZE	(offsetof(struct bpf_line_info, line_col) + \
@@ -7651,6 +7660,9 @@ static int do_check(struct bpf_verifier_env *env)
 			0 /* frameno */,
 			0 /* subprogno, zero == main subprog */);
 
+	if (btf_check_func_arg_match(env, 0))
+		return -EINVAL;
+
 	for (;;) {
 		struct bpf_insn *insn;
 		u8 class;
-- 
cgit v1.2.3-59-g8ed1b


From 5b92a28aae4dd0f88778d540ecfdcdaec5a41723 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 14 Nov 2019 10:57:17 -0800
Subject: bpf: Support attaching tracing BPF program to other BPF programs

Allow FENTRY/FEXIT BPF programs to attach to other BPF programs of any type
including their subprograms. This feature allows snooping on input and output
packets in XDP, TC programs including their return values. In order to do that
the verifier needs to track types not only of vmlinux, but types of other BPF
programs as well. The verifier also needs to translate uapi/linux/bpf.h types
used by networking programs into kernel internal BTF types used by FENTRY/FEXIT
BPF programs. In some cases LLVM optimizations can remove arguments from BPF
subprograms without adjusting BTF info that LLVM backend knows. When BTF info
disagrees with actual types that the verifiers sees the BPF trampoline has to
fallback to conservative and treat all arguments as u64. The FENTRY/FEXIT
program can still attach to such subprograms, but it won't be able to recognize
pointer types like 'struct sk_buff *' and it won't be able to pass them to
bpf_skb_output() for dumping packets to user space. The FENTRY/FEXIT program
would need to use bpf_probe_read_kernel() instead.

The BPF_PROG_LOAD command is extended with attach_prog_fd field. When it's set
to zero the attach_btf_id is one vmlinux BTF type ids. When attach_prog_fd
points to previously loaded BPF program the attach_btf_id is BTF type id of
main function or one of its subprograms.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20191114185720.1641606-18-ast@kernel.org
---
 arch/x86/net/bpf_jit_comp.c |  3 +-
 include/linux/bpf.h         |  1 +
 include/linux/btf.h         |  1 +
 include/uapi/linux/bpf.h    |  1 +
 kernel/bpf/btf.c            | 70 +++++++++++++++++++++++++++++++++-----
 kernel/bpf/core.c           |  2 ++
 kernel/bpf/syscall.c        | 19 ++++++++---
 kernel/bpf/verifier.c       | 83 +++++++++++++++++++++++++++++++++++++--------
 8 files changed, 152 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index c06096df9118..2e586f579945 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -504,7 +504,8 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 	u8 *prog;
 	int ret;
 
-	if (!is_kernel_text((long)ip))
+	if (!is_kernel_text((long)ip) &&
+	    !is_bpf_text_address((long)ip))
 		/* BPF trampoline in modules is not supported */
 		return -EINVAL;
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c70bf04726b4..5b81cde47314 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -495,6 +495,7 @@ struct bpf_prog_aux {
 	u32 func_cnt; /* used by non-func prog as the number of func progs */
 	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
 	u32 attach_btf_id; /* in-kernel BTF type id to attach to */
+	struct bpf_prog *linked_prog;
 	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
 	bool offload_requested;
 	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 9dee00859c5f..79d4abc2556a 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -88,6 +88,7 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t)
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
 const char *btf_name_by_offset(const struct btf *btf, u32 offset);
 struct btf *btf_parse_vmlinux(void);
+struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog);
 #else
 static inline const struct btf_type *btf_type_by_id(const struct btf *btf,
 						    u32 type_id)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 69c200e6e696..4842a134b202 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -425,6 +425,7 @@ union bpf_attr {
 		__aligned_u64	line_info;	/* line info */
 		__u32		line_info_cnt;	/* number of bpf_line_info records */
 		__u32		attach_btf_id;	/* in-kernel BTF type id to attach to */
+		__u32		attach_prog_fd; /* 0 to attach to vmlinux */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4620267b186e..40efde5eedcb 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3530,6 +3530,20 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
 	return ctx_type;
 }
 
+static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
+				     struct btf *btf,
+				     const struct btf_type *t,
+				     enum bpf_prog_type prog_type)
+{
+	const struct btf_member *prog_ctx_type, *kern_ctx_type;
+
+	prog_ctx_type = btf_get_prog_ctx_type(log, btf, t, prog_type);
+	if (!prog_ctx_type)
+		return -ENOENT;
+	kern_ctx_type = prog_ctx_type + 1;
+	return kern_ctx_type->type;
+}
+
 struct btf *btf_parse_vmlinux(void)
 {
 	struct btf_verifier_env *env = NULL;
@@ -3602,15 +3616,29 @@ errout:
 	return ERR_PTR(err);
 }
 
+struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
+{
+	struct bpf_prog *tgt_prog = prog->aux->linked_prog;
+
+	if (tgt_prog) {
+		return tgt_prog->aux->btf;
+	} else {
+		return btf_vmlinux;
+	}
+}
+
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info)
 {
 	const struct btf_type *t = prog->aux->attach_func_proto;
+	struct bpf_prog *tgt_prog = prog->aux->linked_prog;
+	struct btf *btf = bpf_prog_get_target_btf(prog);
 	const char *tname = prog->aux->attach_func_name;
 	struct bpf_verifier_log *log = info->log;
 	const struct btf_param *args;
 	u32 nr_args, arg;
+	int ret;
 
 	if (off % 8) {
 		bpf_log(log, "func '%s' offset %d is not multiple of 8\n",
@@ -3619,7 +3647,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	}
 	arg = off / 8;
 	args = (const struct btf_param *)(t + 1);
-	nr_args = btf_type_vlen(t);
+	/* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */
+	nr_args = t ? btf_type_vlen(t) : 5;
 	if (prog->aux->attach_btf_trace) {
 		/* skip first 'void *__data' argument in btf_trace_##name typedef */
 		args++;
@@ -3628,18 +3657,24 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 
 	if (prog->expected_attach_type == BPF_TRACE_FEXIT &&
 	    arg == nr_args) {
+		if (!t)
+			/* Default prog with 5 args. 6th arg is retval. */
+			return true;
 		/* function return type */
-		t = btf_type_by_id(btf_vmlinux, t->type);
+		t = btf_type_by_id(btf, t->type);
 	} else if (arg >= nr_args) {
 		bpf_log(log, "func '%s' doesn't have %d-th argument\n",
 			tname, arg + 1);
 		return false;
 	} else {
-		t = btf_type_by_id(btf_vmlinux, args[arg].type);
+		if (!t)
+			/* Default prog with 5 args */
+			return true;
+		t = btf_type_by_id(btf, args[arg].type);
 	}
 	/* skip modifiers */
 	while (btf_type_is_modifier(t))
-		t = btf_type_by_id(btf_vmlinux, t->type);
+		t = btf_type_by_id(btf, t->type);
 	if (btf_type_is_int(t))
 		/* accessing a scalar */
 		return true;
@@ -3647,7 +3682,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		bpf_log(log,
 			"func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
 			tname, arg,
-			__btf_name_by_offset(btf_vmlinux, t->name_off),
+			__btf_name_by_offset(btf, t->name_off),
 			btf_kind_str[BTF_INFO_KIND(t->info)]);
 		return false;
 	}
@@ -3662,10 +3697,19 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	info->reg_type = PTR_TO_BTF_ID;
 	info->btf_id = t->type;
 
-	t = btf_type_by_id(btf_vmlinux, t->type);
+	if (tgt_prog) {
+		ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type);
+		if (ret > 0) {
+			info->btf_id = ret;
+			return true;
+		} else {
+			return false;
+		}
+	}
+	t = btf_type_by_id(btf, t->type);
 	/* skip modifiers */
 	while (btf_type_is_modifier(t))
-		t = btf_type_by_id(btf_vmlinux, t->type);
+		t = btf_type_by_id(btf, t->type);
 	if (!btf_type_is_struct(t)) {
 		bpf_log(log,
 			"func '%s' arg%d type %s is not a struct\n",
@@ -3674,7 +3718,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	}
 	bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
 		tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)],
-		__btf_name_by_offset(btf_vmlinux, t->name_off));
+		__btf_name_by_offset(btf, t->name_off));
 	return true;
 }
 
@@ -3954,6 +3998,16 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 	u32 i, nargs;
 	int ret;
 
+	if (!func) {
+		/* BTF function prototype doesn't match the verifier types.
+		 * Fall back to 5 u64 args.
+		 */
+		for (i = 0; i < 5; i++)
+			m->arg_size[i] = 8;
+		m->ret_size = 8;
+		m->nr_args = 5;
+		return 0;
+	}
 	args = (const struct btf_param *)(func + 1);
 	nargs = btf_type_vlen(func);
 	if (nargs >= MAX_BPF_FUNC_ARGS) {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index da5a8b8e278f..b5945c3aaa8e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2031,6 +2031,8 @@ void bpf_prog_free(struct bpf_prog *fp)
 {
 	struct bpf_prog_aux *aux = fp->aux;
 
+	if (aux->linked_prog)
+		bpf_prog_put(aux->linked_prog);
 	INIT_WORK(&aux->work, bpf_prog_free_deferred);
 	schedule_work(&aux->work);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 43ba647de720..c88c815c2154 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1577,7 +1577,7 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
 static int
 bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 			   enum bpf_attach_type expected_attach_type,
-			   u32 btf_id)
+			   u32 btf_id, u32 prog_fd)
 {
 	switch (prog_type) {
 	case BPF_PROG_TYPE_TRACING:
@@ -1585,7 +1585,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 			return -EINVAL;
 		break;
 	default:
-		if (btf_id)
+		if (btf_id || prog_fd)
 			return -EINVAL;
 		break;
 	}
@@ -1636,7 +1636,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD attach_btf_id
+#define	BPF_PROG_LOAD_LAST_FIELD attach_prog_fd
 
 static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 {
@@ -1679,7 +1679,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 
 	bpf_prog_load_fixup_attach_type(attr);
 	if (bpf_prog_load_check_attach(type, attr->expected_attach_type,
-				       attr->attach_btf_id))
+				       attr->attach_btf_id,
+				       attr->attach_prog_fd))
 		return -EINVAL;
 
 	/* plain bpf_prog allocation */
@@ -1689,6 +1690,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 
 	prog->expected_attach_type = attr->expected_attach_type;
 	prog->aux->attach_btf_id = attr->attach_btf_id;
+	if (attr->attach_prog_fd) {
+		struct bpf_prog *tgt_prog;
+
+		tgt_prog = bpf_prog_get(attr->attach_prog_fd);
+		if (IS_ERR(tgt_prog)) {
+			err = PTR_ERR(tgt_prog);
+			goto free_prog_nouncharge;
+		}
+		prog->aux->linked_prog = tgt_prog;
+	}
 
 	prog->aux->offload_requested = !!attr->prog_ifindex;
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 11910149ca2f..e9dc95a18d44 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9390,13 +9390,17 @@ static void print_verification_stats(struct bpf_verifier_env *env)
 static int check_attach_btf_id(struct bpf_verifier_env *env)
 {
 	struct bpf_prog *prog = env->prog;
+	struct bpf_prog *tgt_prog = prog->aux->linked_prog;
 	u32 btf_id = prog->aux->attach_btf_id;
 	const char prefix[] = "btf_trace_";
+	int ret = 0, subprog = -1, i;
 	struct bpf_trampoline *tr;
 	const struct btf_type *t;
+	bool conservative = true;
 	const char *tname;
-	int ret = 0;
+	struct btf *btf;
 	long addr;
+	u64 key;
 
 	if (prog->type != BPF_PROG_TYPE_TRACING)
 		return 0;
@@ -9405,19 +9409,47 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 		verbose(env, "Tracing programs must provide btf_id\n");
 		return -EINVAL;
 	}
-	t = btf_type_by_id(btf_vmlinux, btf_id);
+	btf = bpf_prog_get_target_btf(prog);
+	if (!btf) {
+		verbose(env,
+			"FENTRY/FEXIT program can only be attached to another program annotated with BTF\n");
+		return -EINVAL;
+	}
+	t = btf_type_by_id(btf, btf_id);
 	if (!t) {
 		verbose(env, "attach_btf_id %u is invalid\n", btf_id);
 		return -EINVAL;
 	}
-	tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+	tname = btf_name_by_offset(btf, t->name_off);
 	if (!tname) {
 		verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id);
 		return -EINVAL;
 	}
+	if (tgt_prog) {
+		struct bpf_prog_aux *aux = tgt_prog->aux;
+
+		for (i = 0; i < aux->func_info_cnt; i++)
+			if (aux->func_info[i].type_id == btf_id) {
+				subprog = i;
+				break;
+			}
+		if (subprog == -1) {
+			verbose(env, "Subprog %s doesn't exist\n", tname);
+			return -EINVAL;
+		}
+		conservative = aux->func_info_aux[subprog].unreliable;
+		key = ((u64)aux->id) << 32 | btf_id;
+	} else {
+		key = btf_id;
+	}
 
 	switch (prog->expected_attach_type) {
 	case BPF_TRACE_RAW_TP:
+		if (tgt_prog) {
+			verbose(env,
+				"Only FENTRY/FEXIT progs are attachable to another BPF prog\n");
+			return -EINVAL;
+		}
 		if (!btf_type_is_typedef(t)) {
 			verbose(env, "attach_btf_id %u is not a typedef\n",
 				btf_id);
@@ -9429,11 +9461,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 			return -EINVAL;
 		}
 		tname += sizeof(prefix) - 1;
-		t = btf_type_by_id(btf_vmlinux, t->type);
+		t = btf_type_by_id(btf, t->type);
 		if (!btf_type_is_ptr(t))
 			/* should never happen in valid vmlinux build */
 			return -EINVAL;
-		t = btf_type_by_id(btf_vmlinux, t->type);
+		t = btf_type_by_id(btf, t->type);
 		if (!btf_type_is_func_proto(t))
 			/* should never happen in valid vmlinux build */
 			return -EINVAL;
@@ -9452,30 +9484,51 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 				btf_id);
 			return -EINVAL;
 		}
-		t = btf_type_by_id(btf_vmlinux, t->type);
+		t = btf_type_by_id(btf, t->type);
 		if (!btf_type_is_func_proto(t))
 			return -EINVAL;
-		tr = bpf_trampoline_lookup(btf_id);
+		tr = bpf_trampoline_lookup(key);
 		if (!tr)
 			return -ENOMEM;
 		prog->aux->attach_func_name = tname;
+		/* t is either vmlinux type or another program's type */
 		prog->aux->attach_func_proto = t;
 		mutex_lock(&tr->mutex);
 		if (tr->func.addr) {
 			prog->aux->trampoline = tr;
 			goto out;
 		}
-		ret = btf_distill_func_proto(&env->log, btf_vmlinux, t,
+		if (tgt_prog && conservative) {
+			prog->aux->attach_func_proto = NULL;
+			t = NULL;
+		}
+		ret = btf_distill_func_proto(&env->log, btf, t,
 					     tname, &tr->func.model);
 		if (ret < 0)
 			goto out;
-		addr = kallsyms_lookup_name(tname);
-		if (!addr) {
-			verbose(env,
-				"The address of function %s cannot be found\n",
-				tname);
-			ret = -ENOENT;
-			goto out;
+		if (tgt_prog) {
+			if (!tgt_prog->jited) {
+				/* for now */
+				verbose(env, "Can trace only JITed BPF progs\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			if (tgt_prog->type == BPF_PROG_TYPE_TRACING) {
+				/* prevent cycles */
+				verbose(env, "Cannot recursively attach\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
+		} else {
+			addr = kallsyms_lookup_name(tname);
+			if (!addr) {
+				verbose(env,
+					"The address of function %s cannot be found\n",
+					tname);
+				ret = -ENOENT;
+				goto out;
+			}
 		}
 		tr->func.addr = (void *)addr;
 		prog->aux->trampoline = tr;
-- 
cgit v1.2.3-59-g8ed1b


From c3f812cea0d7006469d1cf33a4a9f0a12bb4b3a3 Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Thu, 14 Nov 2019 14:13:00 -0800
Subject: page_pool: do not release pool until inflight == 0.

The page pool keeps track of the number of pages in flight, and
it isn't safe to remove the pool until all pages are returned.

Disallow removing the pool until all pages are back, so the pool
is always available for page producers.

Make the page pool responsible for its own delayed destruction
instead of relying on XDP, so the page pool can be used without
the xdp memory model.

When all pages are returned, free the pool and notify xdp if the
pool is registered with the xdp memory system.  Have the callback
perform a table walk since some drivers (cpsw) may share the pool
among multiple xdp_rxq_info.

Note that the increment of pages_state_release_cnt may result in
inflight == 0, resulting in the pool being released.

Fixes: d956a048cd3f ("xdp: force mem allocator removal and periodic warning")
Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |   4 +-
 include/net/page_pool.h                           |  52 +++------
 include/net/xdp_priv.h                            |   4 -
 include/trace/events/xdp.h                        |  19 +---
 net/core/page_pool.c                              | 122 ++++++++++++++--------
 net/core/xdp.c                                    | 121 ++++++++-------------
 6 files changed, 139 insertions(+), 183 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 4ba250a9008f..8cc4cd0cc515 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1503,10 +1503,8 @@ static void free_dma_rx_desc_resources(struct stmmac_priv *priv)
 					  rx_q->dma_erx, rx_q->dma_rx_phy);
 
 		kfree(rx_q->buf_pool);
-		if (rx_q->page_pool) {
-			page_pool_request_shutdown(rx_q->page_pool);
+		if (rx_q->page_pool)
 			page_pool_destroy(rx_q->page_pool);
-		}
 	}
 }
 
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 2cbcdbdec254..1121faa99c12 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -70,7 +70,12 @@ struct page_pool_params {
 struct page_pool {
 	struct page_pool_params p;
 
-        u32 pages_state_hold_cnt;
+	struct delayed_work release_dw;
+	void (*disconnect)(void *);
+	unsigned long defer_start;
+	unsigned long defer_warn;
+
+	u32 pages_state_hold_cnt;
 
 	/*
 	 * Data structure for allocation side
@@ -129,25 +134,19 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
 
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
-void __page_pool_free(struct page_pool *pool);
-static inline void page_pool_free(struct page_pool *pool)
-{
-	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
-	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
-	 */
 #ifdef CONFIG_PAGE_POOL
-	__page_pool_free(pool);
-#endif
-}
-
-/* Drivers use this instead of page_pool_free */
+void page_pool_destroy(struct page_pool *pool);
+void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *));
+#else
 static inline void page_pool_destroy(struct page_pool *pool)
 {
-	if (!pool)
-		return;
+}
 
-	page_pool_free(pool);
+static inline void page_pool_use_xdp_mem(struct page_pool *pool,
+					 void (*disconnect)(void *))
+{
 }
+#endif
 
 /* Never call this directly, use helpers below */
 void __page_pool_put_page(struct page_pool *pool,
@@ -170,24 +169,6 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 	__page_pool_put_page(pool, page, true);
 }
 
-/* API user MUST have disconnected alloc-side (not allowed to call
- * page_pool_alloc_pages()) before calling this.  The free-side can
- * still run concurrently, to handle in-flight packet-pages.
- *
- * A request to shutdown can fail (with false) if there are still
- * in-flight packet-pages.
- */
-bool __page_pool_request_shutdown(struct page_pool *pool);
-static inline bool page_pool_request_shutdown(struct page_pool *pool)
-{
-	bool safe_to_remove = false;
-
-#ifdef CONFIG_PAGE_POOL
-	safe_to_remove = __page_pool_request_shutdown(pool);
-#endif
-	return safe_to_remove;
-}
-
 /* Disconnects a page (from a page_pool).  API users can have a need
  * to disconnect a page (from a page_pool), to allow it to be used as
  * a regular page (that will eventually be returned to the normal
@@ -216,11 +197,6 @@ static inline bool is_page_pool_compiled_in(void)
 #endif
 }
 
-static inline void page_pool_get(struct page_pool *pool)
-{
-	refcount_inc(&pool->user_cnt);
-}
-
 static inline bool page_pool_put(struct page_pool *pool)
 {
 	return refcount_dec_and_test(&pool->user_cnt);
diff --git a/include/net/xdp_priv.h b/include/net/xdp_priv.h
index 6a8cba6ea79a..a9d5b7603b89 100644
--- a/include/net/xdp_priv.h
+++ b/include/net/xdp_priv.h
@@ -12,12 +12,8 @@ struct xdp_mem_allocator {
 		struct page_pool *page_pool;
 		struct zero_copy_allocator *zc_alloc;
 	};
-	int disconnect_cnt;
-	unsigned long defer_start;
 	struct rhash_head node;
 	struct rcu_head rcu;
-	struct delayed_work defer_wq;
-	unsigned long defer_warn;
 };
 
 #endif /* __LINUX_NET_XDP_PRIV_H__ */
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index c7e3c9c5bad3..a7378bcd9928 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -317,19 +317,15 @@ __MEM_TYPE_MAP(__MEM_TYPE_TP_FN)
 
 TRACE_EVENT(mem_disconnect,
 
-	TP_PROTO(const struct xdp_mem_allocator *xa,
-		 bool safe_to_remove, bool force),
+	TP_PROTO(const struct xdp_mem_allocator *xa),
 
-	TP_ARGS(xa, safe_to_remove, force),
+	TP_ARGS(xa),
 
 	TP_STRUCT__entry(
 		__field(const struct xdp_mem_allocator *,	xa)
 		__field(u32,		mem_id)
 		__field(u32,		mem_type)
 		__field(const void *,	allocator)
-		__field(bool,		safe_to_remove)
-		__field(bool,		force)
-		__field(int,		disconnect_cnt)
 	),
 
 	TP_fast_assign(
@@ -337,19 +333,12 @@ TRACE_EVENT(mem_disconnect,
 		__entry->mem_id		= xa->mem.id;
 		__entry->mem_type	= xa->mem.type;
 		__entry->allocator	= xa->allocator;
-		__entry->safe_to_remove	= safe_to_remove;
-		__entry->force		= force;
-		__entry->disconnect_cnt	= xa->disconnect_cnt;
 	),
 
-	TP_printk("mem_id=%d mem_type=%s allocator=%p"
-		  " safe_to_remove=%s force=%s disconnect_cnt=%d",
+	TP_printk("mem_id=%d mem_type=%s allocator=%p",
 		  __entry->mem_id,
 		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
-		  __entry->allocator,
-		  __entry->safe_to_remove ? "true" : "false",
-		  __entry->force ? "true" : "false",
-		  __entry->disconnect_cnt
+		  __entry->allocator
 	)
 );
 
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5bc65587f1c4..dfc2501c35d9 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -18,6 +18,9 @@
 
 #include <trace/events/page_pool.h>
 
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (60 * HZ)
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
@@ -193,22 +196,14 @@ static s32 page_pool_inflight(struct page_pool *pool)
 {
 	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
 	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
-	s32 distance;
-
-	distance = _distance(hold_cnt, release_cnt);
-
-	trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
-	return distance;
-}
+	s32 inflight;
 
-static bool __page_pool_safe_to_destroy(struct page_pool *pool)
-{
-	s32 inflight = page_pool_inflight(pool);
+	inflight = _distance(hold_cnt, release_cnt);
 
-	/* The distance should not be able to become negative */
+	trace_page_pool_inflight(pool, inflight, hold_cnt, release_cnt);
 	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
 
-	return (inflight == 0);
+	return inflight;
 }
 
 /* Cleanup page_pool state from page */
@@ -216,6 +211,7 @@ static void __page_pool_clean_page(struct page_pool *pool,
 				   struct page *page)
 {
 	dma_addr_t dma;
+	int count;
 
 	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
 		goto skip_dma_unmap;
@@ -227,9 +223,11 @@ static void __page_pool_clean_page(struct page_pool *pool,
 			     DMA_ATTR_SKIP_CPU_SYNC);
 	page->dma_addr = 0;
 skip_dma_unmap:
-	atomic_inc(&pool->pages_state_release_cnt);
-	trace_page_pool_state_release(pool, page,
-			      atomic_read(&pool->pages_state_release_cnt));
+	/* This may be the last page returned, releasing the pool, so
+	 * it is not safe to reference pool afterwards.
+	 */
+	count = atomic_inc_return(&pool->pages_state_release_cnt);
+	trace_page_pool_state_release(pool, page, count);
 }
 
 /* unmap the page and clean our state */
@@ -338,31 +336,10 @@ static void __page_pool_empty_ring(struct page_pool *pool)
 	}
 }
 
-static void __warn_in_flight(struct page_pool *pool)
+static void page_pool_free(struct page_pool *pool)
 {
-	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
-	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
-	s32 distance;
-
-	distance = _distance(hold_cnt, release_cnt);
-
-	/* Drivers should fix this, but only problematic when DMA is used */
-	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
-	     distance, hold_cnt, release_cnt);
-}
-
-void __page_pool_free(struct page_pool *pool)
-{
-	/* Only last user actually free/release resources */
-	if (!page_pool_put(pool))
-		return;
-
-	WARN(pool->alloc.count, "API usage violation");
-	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
-
-	/* Can happen due to forced shutdown */
-	if (!__page_pool_safe_to_destroy(pool))
-		__warn_in_flight(pool);
+	if (pool->disconnect)
+		pool->disconnect(pool);
 
 	ptr_ring_cleanup(&pool->ring, NULL);
 
@@ -371,12 +348,8 @@ void __page_pool_free(struct page_pool *pool)
 
 	kfree(pool);
 }
-EXPORT_SYMBOL(__page_pool_free);
 
-/* Request to shutdown: release pages cached by page_pool, and check
- * for in-flight pages
- */
-bool __page_pool_request_shutdown(struct page_pool *pool)
+static void page_pool_scrub(struct page_pool *pool)
 {
 	struct page *page;
 
@@ -393,7 +366,64 @@ bool __page_pool_request_shutdown(struct page_pool *pool)
 	 * be in-flight.
 	 */
 	__page_pool_empty_ring(pool);
+}
+
+static int page_pool_release(struct page_pool *pool)
+{
+	int inflight;
+
+	page_pool_scrub(pool);
+	inflight = page_pool_inflight(pool);
+	if (!inflight)
+		page_pool_free(pool);
+
+	return inflight;
+}
+
+static void page_pool_release_retry(struct work_struct *wq)
+{
+	struct delayed_work *dwq = to_delayed_work(wq);
+	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
+	int inflight;
+
+	inflight = page_pool_release(pool);
+	if (!inflight)
+		return;
+
+	/* Periodic warning */
+	if (time_after_eq(jiffies, pool->defer_warn)) {
+		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
+
+		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
+			__func__, inflight, sec);
+		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+	}
+
+	/* Still not ready to be disconnected, retry later */
+	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
+
+void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
+{
+	refcount_inc(&pool->user_cnt);
+	pool->disconnect = disconnect;
+}
+
+void page_pool_destroy(struct page_pool *pool)
+{
+	if (!pool)
+		return;
+
+	if (!page_pool_put(pool))
+		return;
+
+	if (!page_pool_release(pool))
+		return;
+
+	pool->defer_start = jiffies;
+	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
 
-	return __page_pool_safe_to_destroy(pool);
+	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
+	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
 }
-EXPORT_SYMBOL(__page_pool_request_shutdown);
+EXPORT_SYMBOL(page_pool_destroy);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 20781ad5f9c3..8e405abaf05a 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -70,10 +70,6 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 
 	xa = container_of(rcu, struct xdp_mem_allocator, rcu);
 
-	/* Allocator have indicated safe to remove before this is called */
-	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
-		page_pool_free(xa->page_pool);
-
 	/* Allow this ID to be reused */
 	ida_simple_remove(&mem_id_pool, xa->mem.id);
 
@@ -85,62 +81,57 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 	kfree(xa);
 }
 
-static bool __mem_id_disconnect(int id, bool force)
+static void mem_xa_remove(struct xdp_mem_allocator *xa)
 {
-	struct xdp_mem_allocator *xa;
-	bool safe_to_remove = true;
+	trace_mem_disconnect(xa);
 
 	mutex_lock(&mem_id_lock);
 
-	xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
-	if (!xa) {
-		mutex_unlock(&mem_id_lock);
-		WARN(1, "Request remove non-existing id(%d), driver bug?", id);
-		return true;
-	}
-	xa->disconnect_cnt++;
-
-	/* Detects in-flight packet-pages for page_pool */
-	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
-		safe_to_remove = page_pool_request_shutdown(xa->page_pool);
-
-	trace_mem_disconnect(xa, safe_to_remove, force);
-
-	if ((safe_to_remove || force) &&
-	    !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+	if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
 		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
 
 	mutex_unlock(&mem_id_lock);
-	return (safe_to_remove|force);
 }
 
-#define DEFER_TIME (msecs_to_jiffies(1000))
-#define DEFER_WARN_INTERVAL (30 * HZ)
-#define DEFER_MAX_RETRIES 120
+static void mem_allocator_disconnect(void *allocator)
+{
+	struct xdp_mem_allocator *xa;
+	struct rhashtable_iter iter;
+
+	rhashtable_walk_enter(mem_id_ht, &iter);
+	do {
+		rhashtable_walk_start(&iter);
+
+		while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) {
+			if (xa->allocator == allocator)
+				mem_xa_remove(xa);
+		}
+
+		rhashtable_walk_stop(&iter);
 
-static void mem_id_disconnect_defer_retry(struct work_struct *wq)
+	} while (xa == ERR_PTR(-EAGAIN));
+	rhashtable_walk_exit(&iter);
+}
+
+static void mem_id_disconnect(int id)
 {
-	struct delayed_work *dwq = to_delayed_work(wq);
-	struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
-	bool force = false;
+	struct xdp_mem_allocator *xa;
 
-	if (xa->disconnect_cnt > DEFER_MAX_RETRIES)
-		force = true;
+	mutex_lock(&mem_id_lock);
 
-	if (__mem_id_disconnect(xa->mem.id, force))
+	xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+	if (!xa) {
+		mutex_unlock(&mem_id_lock);
+		WARN(1, "Request remove non-existing id(%d), driver bug?", id);
 		return;
+	}
 
-	/* Periodic warning */
-	if (time_after_eq(jiffies, xa->defer_warn)) {
-		int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ;
+	trace_mem_disconnect(xa);
 
-		pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n",
-			__func__, xa->mem.id, xa->disconnect_cnt, sec);
-		xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
-	}
+	if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
 
-	/* Still not ready to be disconnected, retry later */
-	schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
+	mutex_unlock(&mem_id_lock);
 }
 
 void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
@@ -153,38 +144,21 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 		return;
 	}
 
-	if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL &&
-	    xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) {
-		return;
-	}
-
 	if (id == 0)
 		return;
 
-	if (__mem_id_disconnect(id, false))
-		return;
-
-	/* Could not disconnect, defer new disconnect attempt to later */
-	mutex_lock(&mem_id_lock);
+	if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY)
+		return mem_id_disconnect(id);
 
-	xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
-	if (!xa) {
-		mutex_unlock(&mem_id_lock);
-		return;
+	if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
+		rcu_read_lock();
+		xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
+		page_pool_destroy(xa->page_pool);
+		rcu_read_unlock();
 	}
-	xa->defer_start = jiffies;
-	xa->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
-
-	INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry);
-	mutex_unlock(&mem_id_lock);
-	schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
 
-/* This unregister operation will also cleanup and destroy the
- * allocator. The page_pool_free() operation is first called when it's
- * safe to remove, possibly deferred to a workqueue.
- */
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
 {
 	/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -371,7 +345,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 	}
 
 	if (type == MEM_TYPE_PAGE_POOL)
-		page_pool_get(xdp_alloc->page_pool);
+		page_pool_use_xdp_mem(allocator, mem_allocator_disconnect);
 
 	mutex_unlock(&mem_id_lock);
 
@@ -402,15 +376,8 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
 		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		page = virt_to_head_page(data);
-		if (likely(xa)) {
-			napi_direct &= !xdp_return_frame_no_direct();
-			page_pool_put_page(xa->page_pool, page, napi_direct);
-		} else {
-			/* Hopefully stack show who to blame for late return */
-			WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id);
-			trace_mem_return_failed(mem, page);
-			put_page(page);
-		}
+		napi_direct &= !xdp_return_frame_no_direct();
+		page_pool_put_page(xa->page_pool, page, napi_direct);
 		rcu_read_unlock();
 		break;
 	case MEM_TYPE_PAGE_SHARED:
-- 
cgit v1.2.3-59-g8ed1b


From 1e0bd5a091e5d9e0f1d5b0e6329b87bb1792f784 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andriin@fb.com>
Date: Sun, 17 Nov 2019 09:28:02 -0800
Subject: bpf: Switch bpf_map ref counter to atomic64_t so bpf_map_inc() never
 fails

92117d8443bc ("bpf: fix refcnt overflow") turned refcounting of bpf_map into
potentially failing operation, when refcount reaches BPF_MAX_REFCNT limit
(32k). Due to using 32-bit counter, it's possible in practice to overflow
refcounter and make it wrap around to 0, causing erroneous map free, while
there are still references to it, causing use-after-free problems.

But having a failing refcounting operations are problematic in some cases. One
example is mmap() interface. After establishing initial memory-mapping, user
is allowed to arbitrarily map/remap/unmap parts of mapped memory, arbitrarily
splitting it into multiple non-contiguous regions. All this happening without
any control from the users of mmap subsystem. Rather mmap subsystem sends
notifications to original creator of memory mapping through open/close
callbacks, which are optionally specified during initial memory mapping
creation. These callbacks are used to maintain accurate refcount for bpf_map
(see next patch in this series). The problem is that open() callback is not
supposed to fail, because memory-mapped resource is set up and properly
referenced. This is posing a problem for using memory-mapping with BPF maps.

One solution to this is to maintain separate refcount for just memory-mappings
and do single bpf_map_inc/bpf_map_put when it goes from/to zero, respectively.
There are similar use cases in current work on tcp-bpf, necessitating extra
counter as well. This seems like a rather unfortunate and ugly solution that
doesn't scale well to various new use cases.

Another approach to solve this is to use non-failing refcount_t type, which
uses 32-bit counter internally, but, once reaching overflow state at UINT_MAX,
stays there. This utlimately causes memory leak, but prevents use after free.

But given refcounting is not the most performance-critical operation with BPF
maps (it's not used from running BPF program code), we can also just switch to
64-bit counter that can't overflow in practice, potentially disadvantaging
32-bit platforms a tiny bit. This simplifies semantics and allows above
described scenarios to not worry about failing refcount increment operation.

In terms of struct bpf_map size, we are still good and use the same amount of
space:

BEFORE (3 cache lines, 8 bytes of padding at the end):
struct bpf_map {
	const struct bpf_map_ops  * ops __attribute__((__aligned__(64))); /*     0     8 */
	struct bpf_map *           inner_map_meta;       /*     8     8 */
	void *                     security;             /*    16     8 */
	enum bpf_map_type  map_type;                     /*    24     4 */
	u32                        key_size;             /*    28     4 */
	u32                        value_size;           /*    32     4 */
	u32                        max_entries;          /*    36     4 */
	u32                        map_flags;            /*    40     4 */
	int                        spin_lock_off;        /*    44     4 */
	u32                        id;                   /*    48     4 */
	int                        numa_node;            /*    52     4 */
	u32                        btf_key_type_id;      /*    56     4 */
	u32                        btf_value_type_id;    /*    60     4 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct btf *               btf;                  /*    64     8 */
	struct bpf_map_memory memory;                    /*    72    16 */
	bool                       unpriv_array;         /*    88     1 */
	bool                       frozen;               /*    89     1 */

	/* XXX 38 bytes hole, try to pack */

	/* --- cacheline 2 boundary (128 bytes) --- */
	atomic_t                   refcnt __attribute__((__aligned__(64))); /*   128     4 */
	atomic_t                   usercnt;              /*   132     4 */
	struct work_struct work;                         /*   136    32 */
	char                       name[16];             /*   168    16 */

	/* size: 192, cachelines: 3, members: 21 */
	/* sum members: 146, holes: 1, sum holes: 38 */
	/* padding: 8 */
	/* forced alignments: 2, forced holes: 1, sum forced holes: 38 */
} __attribute__((__aligned__(64)));

AFTER (same 3 cache lines, no extra padding now):
struct bpf_map {
	const struct bpf_map_ops  * ops __attribute__((__aligned__(64))); /*     0     8 */
	struct bpf_map *           inner_map_meta;       /*     8     8 */
	void *                     security;             /*    16     8 */
	enum bpf_map_type  map_type;                     /*    24     4 */
	u32                        key_size;             /*    28     4 */
	u32                        value_size;           /*    32     4 */
	u32                        max_entries;          /*    36     4 */
	u32                        map_flags;            /*    40     4 */
	int                        spin_lock_off;        /*    44     4 */
	u32                        id;                   /*    48     4 */
	int                        numa_node;            /*    52     4 */
	u32                        btf_key_type_id;      /*    56     4 */
	u32                        btf_value_type_id;    /*    60     4 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct btf *               btf;                  /*    64     8 */
	struct bpf_map_memory memory;                    /*    72    16 */
	bool                       unpriv_array;         /*    88     1 */
	bool                       frozen;               /*    89     1 */

	/* XXX 38 bytes hole, try to pack */

	/* --- cacheline 2 boundary (128 bytes) --- */
	atomic64_t                 refcnt __attribute__((__aligned__(64))); /*   128     8 */
	atomic64_t                 usercnt;              /*   136     8 */
	struct work_struct work;                         /*   144    32 */
	char                       name[16];             /*   176    16 */

	/* size: 192, cachelines: 3, members: 21 */
	/* sum members: 154, holes: 1, sum holes: 38 */
	/* forced alignments: 2, forced holes: 1, sum forced holes: 38 */
} __attribute__((__aligned__(64)));

This patch, while modifying all users of bpf_map_inc, also cleans up its
interface to match bpf_map_put with separate operations for bpf_map_inc and
bpf_map_inc_with_uref (to match bpf_map_put and bpf_map_put_with_uref,
respectively). Also, given there are no users of bpf_map_inc_not_zero
specifying uref=true, remove uref flag and default to uref=false internally.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20191117172806.2195367-2-andriin@fb.com
---
 drivers/net/ethernet/netronome/nfp/bpf/offload.c |  4 +-
 include/linux/bpf.h                              | 10 ++---
 kernel/bpf/inode.c                               |  2 +-
 kernel/bpf/map_in_map.c                          |  2 +-
 kernel/bpf/syscall.c                             | 51 ++++++++++--------------
 kernel/bpf/verifier.c                            |  6 +--
 kernel/bpf/xskmap.c                              |  6 +--
 net/core/bpf_sk_storage.c                        |  2 +-
 8 files changed, 34 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 88fab6a82acf..06927ba5a3ae 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -46,9 +46,7 @@ nfp_map_ptr_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog,
 	/* Grab a single ref to the map for our record.  The prog destroy ndo
 	 * happens after free_used_maps().
 	 */
-	map = bpf_map_inc(map, false);
-	if (IS_ERR(map))
-		return PTR_ERR(map);
+	bpf_map_inc(map);
 
 	record = kmalloc(sizeof(*record), GFP_KERNEL);
 	if (!record) {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b81cde47314..34a34445c009 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -103,8 +103,8 @@ struct bpf_map {
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
 	 */
-	atomic_t refcnt ____cacheline_aligned;
-	atomic_t usercnt;
+	atomic64_t refcnt ____cacheline_aligned;
+	atomic64_t usercnt;
 	struct work_struct work;
 	char name[BPF_OBJ_NAME_LEN];
 };
@@ -783,9 +783,9 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
-struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
-struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map,
-						   bool uref);
+void bpf_map_inc(struct bpf_map *map);
+void bpf_map_inc_with_uref(struct bpf_map *map);
+struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index a70f7209cda3..2f17f24258dc 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -34,7 +34,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type)
 		raw = bpf_prog_inc(raw);
 		break;
 	case BPF_TYPE_MAP:
-		raw = bpf_map_inc(raw, true);
+		bpf_map_inc_with_uref(raw);
 		break;
 	default:
 		WARN_ON_ONCE(1);
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index fab4fb134547..4cbe987be35b 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -98,7 +98,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map,
 		return inner_map;
 
 	if (bpf_map_meta_equal(map->inner_map_meta, inner_map))
-		inner_map = bpf_map_inc(inner_map, false);
+		bpf_map_inc(inner_map);
 	else
 		inner_map = ERR_PTR(-EINVAL);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c88c815c2154..20030751b7a2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -311,7 +311,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 
 static void bpf_map_put_uref(struct bpf_map *map)
 {
-	if (atomic_dec_and_test(&map->usercnt)) {
+	if (atomic64_dec_and_test(&map->usercnt)) {
 		if (map->ops->map_release_uref)
 			map->ops->map_release_uref(map);
 	}
@@ -322,7 +322,7 @@ static void bpf_map_put_uref(struct bpf_map *map)
  */
 static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
 {
-	if (atomic_dec_and_test(&map->refcnt)) {
+	if (atomic64_dec_and_test(&map->refcnt)) {
 		/* bpf_map_free_id() must be called first */
 		bpf_map_free_id(map, do_idr_lock);
 		btf_put(map->btf);
@@ -575,8 +575,8 @@ static int map_create(union bpf_attr *attr)
 	if (err)
 		goto free_map;
 
-	atomic_set(&map->refcnt, 1);
-	atomic_set(&map->usercnt, 1);
+	atomic64_set(&map->refcnt, 1);
+	atomic64_set(&map->usercnt, 1);
 
 	if (attr->btf_key_type_id || attr->btf_value_type_id) {
 		struct btf *btf;
@@ -653,21 +653,19 @@ struct bpf_map *__bpf_map_get(struct fd f)
 	return f.file->private_data;
 }
 
-/* prog's and map's refcnt limit */
-#define BPF_MAX_REFCNT 32768
-
-struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
+void bpf_map_inc(struct bpf_map *map)
 {
-	if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) {
-		atomic_dec(&map->refcnt);
-		return ERR_PTR(-EBUSY);
-	}
-	if (uref)
-		atomic_inc(&map->usercnt);
-	return map;
+	atomic64_inc(&map->refcnt);
 }
 EXPORT_SYMBOL_GPL(bpf_map_inc);
 
+void bpf_map_inc_with_uref(struct bpf_map *map)
+{
+	atomic64_inc(&map->refcnt);
+	atomic64_inc(&map->usercnt);
+}
+EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
+
 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 {
 	struct fd f = fdget(ufd);
@@ -677,38 +675,30 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 	if (IS_ERR(map))
 		return map;
 
-	map = bpf_map_inc(map, true);
+	bpf_map_inc_with_uref(map);
 	fdput(f);
 
 	return map;
 }
 
 /* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map,
-					      bool uref)
+static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
 {
 	int refold;
 
-	refold = atomic_fetch_add_unless(&map->refcnt, 1, 0);
-
-	if (refold >= BPF_MAX_REFCNT) {
-		__bpf_map_put(map, false);
-		return ERR_PTR(-EBUSY);
-	}
-
+	refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0);
 	if (!refold)
 		return ERR_PTR(-ENOENT);
-
 	if (uref)
-		atomic_inc(&map->usercnt);
+		atomic64_inc(&map->usercnt);
 
 	return map;
 }
 
-struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
 {
 	spin_lock_bh(&map_idr_lock);
-	map = __bpf_map_inc_not_zero(map, uref);
+	map = __bpf_map_inc_not_zero(map, false);
 	spin_unlock_bh(&map_idr_lock);
 
 	return map;
@@ -1455,6 +1445,9 @@ static struct bpf_prog *____bpf_prog_get(struct fd f)
 	return f.file->private_data;
 }
 
+/* prog's refcnt limit */
+#define BPF_MAX_REFCNT 32768
+
 struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
 {
 	if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e9dc95a18d44..9f59f7a19dd0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8179,11 +8179,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 			 * will be used by the valid program until it's unloaded
 			 * and all maps are released in free_used_maps()
 			 */
-			map = bpf_map_inc(map, false);
-			if (IS_ERR(map)) {
-				fdput(f);
-				return PTR_ERR(map);
-			}
+			bpf_map_inc(map);
 
 			aux->map_index = env->used_map_cnt;
 			env->used_maps[env->used_map_cnt++] = map;
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index da16c30868f3..90c4fce1c981 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -11,10 +11,8 @@
 
 int xsk_map_inc(struct xsk_map *map)
 {
-	struct bpf_map *m = &map->map;
-
-	m = bpf_map_inc(m, false);
-	return PTR_ERR_OR_ZERO(m);
+	bpf_map_inc(&map->map);
+	return 0;
 }
 
 void xsk_map_put(struct xsk_map *map)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index da5639a5bd3b..458be6b3eda9 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -798,7 +798,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 		 * Try to grab map refcnt to make sure that it's still
 		 * alive and prevent concurrent removal.
 		 */
-		map = bpf_map_inc_not_zero(&smap->map, false);
+		map = bpf_map_inc_not_zero(&smap->map);
 		if (IS_ERR(map))
 			continue;
 
-- 
cgit v1.2.3-59-g8ed1b


From 85192dbf4de08795afe2b88e52a36fc6abfc3dba Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andriin@fb.com>
Date: Sun, 17 Nov 2019 09:28:03 -0800
Subject: bpf: Convert bpf_prog refcnt to atomic64_t

Similarly to bpf_map's refcnt/usercnt, convert bpf_prog's refcnt to atomic64
and remove artificial 32k limit. This allows to make bpf_prog's refcounting
non-failing, simplifying logic of users of bpf_prog_add/bpf_prog_inc.

Validated compilation by running allyesconfig kernel build.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20191117172806.2195367-3-andriin@fb.com
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         |  9 ++-----
 drivers/net/ethernet/cavium/thunder/nicvf_main.c  |  9 ++-----
 drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c  |  7 ++----
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c    | 24 +++++-------------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 18 ++++----------
 drivers/net/ethernet/qlogic/qede/qede_main.c      |  8 ++----
 drivers/net/virtio_net.c                          |  7 ++----
 include/linux/bpf.h                               | 13 ++++------
 kernel/bpf/inode.c                                |  5 ++--
 kernel/bpf/syscall.c                              | 30 +++++++----------------
 kernel/events/core.c                              |  7 ++----
 11 files changed, 40 insertions(+), 97 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index c07172429c70..9da4fbee3cf7 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -3171,13 +3171,8 @@ static int bnxt_init_one_rx_ring(struct bnxt *bp, int ring_nr)
 	bnxt_init_rxbd_pages(ring, type);
 
 	if (BNXT_RX_PAGE_MODE(bp) && bp->xdp_prog) {
-		rxr->xdp_prog = bpf_prog_add(bp->xdp_prog, 1);
-		if (IS_ERR(rxr->xdp_prog)) {
-			int rc = PTR_ERR(rxr->xdp_prog);
-
-			rxr->xdp_prog = NULL;
-			return rc;
-		}
+		bpf_prog_add(bp->xdp_prog, 1);
+		rxr->xdp_prog = bp->xdp_prog;
 	}
 	prod = rxr->rx_prod;
 	for (i = 0; i < bp->rx_ring_size; i++) {
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 40a44dcb3d9b..f28409279ea4 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -1876,13 +1876,8 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog)
 
 	if (nic->xdp_prog) {
 		/* Attach BPF program */
-		nic->xdp_prog = bpf_prog_add(nic->xdp_prog, nic->rx_queues - 1);
-		if (!IS_ERR(nic->xdp_prog)) {
-			bpf_attached = true;
-		} else {
-			ret = PTR_ERR(nic->xdp_prog);
-			nic->xdp_prog = NULL;
-		}
+		bpf_prog_add(nic->xdp_prog, nic->rx_queues - 1);
+		bpf_attached = true;
 	}
 
 	/* Calculate Tx queues needed for XDP and network stack */
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
index c26c0a7cbb6b..acc56606d3a5 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
@@ -1807,11 +1807,8 @@ static int setup_xdp(struct net_device *dev, struct bpf_prog *prog)
 	if (prog && !xdp_mtu_valid(priv, dev->mtu))
 		return -EINVAL;
 
-	if (prog) {
-		prog = bpf_prog_add(prog, priv->num_channels);
-		if (IS_ERR(prog))
-			return PTR_ERR(prog);
-	}
+	if (prog)
+		bpf_prog_add(prog, priv->num_channels);
 
 	up = netif_running(dev);
 	need_update = (!!priv->xdp_prog != !!prog);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 40ec5acf79c0..d4697beeacc2 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2286,11 +2286,7 @@ int mlx4_en_try_alloc_resources(struct mlx4_en_priv *priv,
 		lockdep_is_held(&priv->mdev->state_lock));
 
 	if (xdp_prog && carry_xdp_prog) {
-		xdp_prog = bpf_prog_add(xdp_prog, tmp->rx_ring_num);
-		if (IS_ERR(xdp_prog)) {
-			mlx4_en_free_resources(tmp);
-			return PTR_ERR(xdp_prog);
-		}
+		bpf_prog_add(xdp_prog, tmp->rx_ring_num);
 		for (i = 0; i < tmp->rx_ring_num; i++)
 			rcu_assign_pointer(tmp->rx_ring[i]->xdp_prog,
 					   xdp_prog);
@@ -2782,11 +2778,9 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 	 * program for a new one.
 	 */
 	if (priv->tx_ring_num[TX_XDP] == xdp_ring_num) {
-		if (prog) {
-			prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
-			if (IS_ERR(prog))
-				return PTR_ERR(prog);
-		}
+		if (prog)
+			bpf_prog_add(prog, priv->rx_ring_num - 1);
+
 		mutex_lock(&mdev->state_lock);
 		for (i = 0; i < priv->rx_ring_num; i++) {
 			old_prog = rcu_dereference_protected(
@@ -2807,13 +2801,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 	if (!tmp)
 		return -ENOMEM;
 
-	if (prog) {
-		prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
-		if (IS_ERR(prog)) {
-			err = PTR_ERR(prog);
-			goto out;
-		}
-	}
+	if (prog)
+		bpf_prog_add(prog, priv->rx_ring_num - 1);
 
 	mutex_lock(&mdev->state_lock);
 	memcpy(&new_prof, priv->prof, sizeof(struct mlx4_en_port_profile));
@@ -2862,7 +2851,6 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 
 unlock_out:
 	mutex_unlock(&mdev->state_lock);
-out:
 	kfree(tmp);
 	return err;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 772bfdbdeb9c..1d4a66fb466a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -408,12 +408,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		rq->stats = &c->priv->channel_stats[c->ix].rq;
 	INIT_WORK(&rq->recover_work, mlx5e_rq_err_cqe_work);
 
-	rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL;
-	if (IS_ERR(rq->xdp_prog)) {
-		err = PTR_ERR(rq->xdp_prog);
-		rq->xdp_prog = NULL;
-		goto err_rq_wq_destroy;
-	}
+	if (params->xdp_prog)
+		bpf_prog_inc(params->xdp_prog);
+	rq->xdp_prog = params->xdp_prog;
 
 	rq_xdp_ix = rq->ix;
 	if (xsk)
@@ -4406,16 +4403,11 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
 	/* no need for full reset when exchanging programs */
 	reset = (!priv->channels.params.xdp_prog || !prog);
 
-	if (was_opened && !reset) {
+	if (was_opened && !reset)
 		/* num_channels is invariant here, so we can take the
 		 * batched reference right upfront.
 		 */
-		prog = bpf_prog_add(prog, priv->channels.num);
-		if (IS_ERR(prog)) {
-			err = PTR_ERR(prog);
-			goto unlock;
-		}
-	}
+		bpf_prog_add(prog, priv->channels.num);
 
 	if (was_opened && reset) {
 		struct mlx5e_channels new_channels = {};
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 8d1c208f778f..1e26964fe4e9 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -2107,12 +2107,8 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 			if (rc)
 				goto out;
 
-			fp->rxq->xdp_prog = bpf_prog_add(edev->xdp_prog, 1);
-			if (IS_ERR(fp->rxq->xdp_prog)) {
-				rc = PTR_ERR(fp->rxq->xdp_prog);
-				fp->rxq->xdp_prog = NULL;
-				goto out;
-			}
+			bpf_prog_add(edev->xdp_prog, 1);
+			fp->rxq->xdp_prog = edev->xdp_prog;
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 5a635f028bdc..4d7d5434cc5d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2445,11 +2445,8 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 	if (!prog && !old_prog)
 		return 0;
 
-	if (prog) {
-		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
-		if (IS_ERR(prog))
-			return PTR_ERR(prog);
-	}
+	if (prog)
+		bpf_prog_add(prog, vi->max_queue_pairs - 1);
 
 	/* Make sure NAPI is not using any XDP TX queues for RX. */
 	if (netif_running(dev)) {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 34a34445c009..fb606dc61a3a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -485,7 +485,7 @@ struct bpf_func_info_aux {
 };
 
 struct bpf_prog_aux {
-	atomic_t refcnt;
+	atomic64_t refcnt;
 	u32 used_map_cnt;
 	u32 max_ctx_offset;
 	u32 max_pkt_offset;
@@ -770,9 +770,9 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops;
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
 				       bool attach_drv);
-struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
+void bpf_prog_add(struct bpf_prog *prog, int i);
 void bpf_prog_sub(struct bpf_prog *prog, int i);
-struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog);
+void bpf_prog_inc(struct bpf_prog *prog);
 struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog);
 void bpf_prog_put(struct bpf_prog *prog);
 int __bpf_prog_charge(struct user_struct *user, u32 pages);
@@ -912,10 +912,8 @@ static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog,
-							  int i)
+static inline void bpf_prog_add(struct bpf_prog *prog, int i)
 {
-	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline void bpf_prog_sub(struct bpf_prog *prog, int i)
@@ -926,9 +924,8 @@ static inline void bpf_prog_put(struct bpf_prog *prog)
 {
 }
 
-static inline struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog)
+static inline void bpf_prog_inc(struct bpf_prog *prog)
 {
-	return ERR_PTR(-EOPNOTSUPP);
 }
 
 static inline struct bpf_prog *__must_check
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 2f17f24258dc..ecf42bec38c0 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -31,7 +31,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type)
 {
 	switch (type) {
 	case BPF_TYPE_PROG:
-		raw = bpf_prog_inc(raw);
+		bpf_prog_inc(raw);
 		break;
 	case BPF_TYPE_MAP:
 		bpf_map_inc_with_uref(raw);
@@ -534,7 +534,8 @@ static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type
 	if (!bpf_prog_get_ok(prog, &type, false))
 		return ERR_PTR(-EINVAL);
 
-	return bpf_prog_inc(prog);
+	bpf_prog_inc(prog);
+	return prog;
 }
 
 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 20030751b7a2..52fe4bacb330 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1339,7 +1339,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
 
 static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 {
-	if (atomic_dec_and_test(&prog->aux->refcnt)) {
+	if (atomic64_dec_and_test(&prog->aux->refcnt)) {
 		perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
 		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog, do_idr_lock);
@@ -1445,16 +1445,9 @@ static struct bpf_prog *____bpf_prog_get(struct fd f)
 	return f.file->private_data;
 }
 
-/* prog's refcnt limit */
-#define BPF_MAX_REFCNT 32768
-
-struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
+void bpf_prog_add(struct bpf_prog *prog, int i)
 {
-	if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
-		atomic_sub(i, &prog->aux->refcnt);
-		return ERR_PTR(-EBUSY);
-	}
-	return prog;
+	atomic64_add(i, &prog->aux->refcnt);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_add);
 
@@ -1465,13 +1458,13 @@ void bpf_prog_sub(struct bpf_prog *prog, int i)
 	 * path holds a reference to the program, thus atomic_sub() can
 	 * be safely used in such cases!
 	 */
-	WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0);
+	WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_sub);
 
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+void bpf_prog_inc(struct bpf_prog *prog)
 {
-	return bpf_prog_add(prog, 1);
+	atomic64_inc(&prog->aux->refcnt);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc);
 
@@ -1480,12 +1473,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
 {
 	int refold;
 
-	refold = atomic_fetch_add_unless(&prog->aux->refcnt, 1, 0);
-
-	if (refold >= BPF_MAX_REFCNT) {
-		__bpf_prog_put(prog, false);
-		return ERR_PTR(-EBUSY);
-	}
+	refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0);
 
 	if (!refold)
 		return ERR_PTR(-ENOENT);
@@ -1523,7 +1511,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
 		goto out;
 	}
 
-	prog = bpf_prog_inc(prog);
+	bpf_prog_inc(prog);
 out:
 	fdput(f);
 	return prog;
@@ -1714,7 +1702,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	prog->orig_prog = NULL;
 	prog->jited = 0;
 
-	atomic_set(&prog->aux->refcnt, 1);
+	atomic64_set(&prog->aux->refcnt, 1);
 	prog->gpl_compatible = is_gpl ? 1 : 0;
 
 	if (bpf_prog_is_dev_bound(prog->aux)) {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index aec8dba2bea4..73c616876597 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10477,12 +10477,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		context = parent_event->overflow_handler_context;
 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
 		if (overflow_handler == bpf_overflow_handler) {
-			struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
+			struct bpf_prog *prog = parent_event->prog;
 
-			if (IS_ERR(prog)) {
-				err = PTR_ERR(prog);
-				goto err_ns;
-			}
+			bpf_prog_inc(prog);
 			event->prog = prog;
 			event->orig_overflow_handler =
 				parent_event->orig_overflow_handler;
-- 
cgit v1.2.3-59-g8ed1b


From fc9702273e2edb90400a34b3be76f7b08fa3344b Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andriin@fb.com>
Date: Sun, 17 Nov 2019 09:28:04 -0800
Subject: bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY

Add ability to memory-map contents of BPF array map. This is extremely useful
for working with BPF global data from userspace programs. It allows to avoid
typical bpf_map_{lookup,update}_elem operations, improving both performance
and usability.

There had to be special considerations for map freezing, to avoid having
writable memory view into a frozen map. To solve this issue, map freezing and
mmap-ing is happening under mutex now:
  - if map is already frozen, no writable mapping is allowed;
  - if map has writable memory mappings active (accounted in map->writecnt),
    map freezing will keep failing with -EBUSY;
  - once number of writable memory mappings drops to zero, map freezing can be
    performed again.

Only non-per-CPU plain arrays are supported right now. Maps with spinlocks
can't be memory mapped either.

For BPF_F_MMAPABLE array, memory allocation has to be done through vmalloc()
to be mmap()'able. We also need to make sure that array data memory is
page-sized and page-aligned, so we over-allocate memory in such a way that
struct bpf_array is at the end of a single page of memory with array->value
being aligned with the start of the second page. On deallocation we need to
accomodate this memory arrangement to free vmalloc()'ed memory correctly.

One important consideration regarding how memory-mapping subsystem functions.
Memory-mapping subsystem provides few optional callbacks, among them open()
and close().  close() is called for each memory region that is unmapped, so
that users can decrease their reference counters and free up resources, if
necessary. open() is *almost* symmetrical: it's called for each memory region
that is being mapped, **except** the very first one. So bpf_map_mmap does
initial refcnt bump, while open() will do any extra ones after that. Thus
number of close() calls is equal to number of open() calls plus one more.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/bpf/20191117172806.2195367-4-andriin@fb.com
---
 include/linux/bpf.h            | 11 +++--
 include/linux/vmalloc.h        |  1 +
 include/uapi/linux/bpf.h       |  3 ++
 kernel/bpf/arraymap.c          | 58 ++++++++++++++++++++++---
 kernel/bpf/syscall.c           | 99 ++++++++++++++++++++++++++++++++++++++++--
 mm/vmalloc.c                   | 20 +++++++++
 tools/include/uapi/linux/bpf.h |  3 ++
 7 files changed, 183 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index fb606dc61a3a..e913dd5946ae 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -12,6 +12,7 @@
 #include <linux/err.h>
 #include <linux/rbtree_latch.h>
 #include <linux/numa.h>
+#include <linux/mm_types.h>
 #include <linux/wait.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/refcount.h>
@@ -68,6 +69,7 @@ struct bpf_map_ops {
 				     u64 *imm, u32 off);
 	int (*map_direct_value_meta)(const struct bpf_map *map,
 				     u64 imm, u32 *off);
+	int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma);
 };
 
 struct bpf_map_memory {
@@ -96,9 +98,10 @@ struct bpf_map {
 	u32 btf_value_type_id;
 	struct btf *btf;
 	struct bpf_map_memory memory;
+	char name[BPF_OBJ_NAME_LEN];
 	bool unpriv_array;
-	bool frozen; /* write-once */
-	/* 48 bytes hole */
+	bool frozen; /* write-once; write-protected by freeze_mutex */
+	/* 22 bytes hole */
 
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
@@ -106,7 +109,8 @@ struct bpf_map {
 	atomic64_t refcnt ____cacheline_aligned;
 	atomic64_t usercnt;
 	struct work_struct work;
-	char name[BPF_OBJ_NAME_LEN];
+	struct mutex freeze_mutex;
+	u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */
 };
 
 static inline bool map_value_has_spin_lock(const struct bpf_map *map)
@@ -795,6 +799,7 @@ void bpf_map_charge_finish(struct bpf_map_memory *mem);
 void bpf_map_charge_move(struct bpf_map_memory *dst,
 			 struct bpf_map_memory *src);
 void *bpf_map_area_alloc(size_t size, int numa_node);
+void *bpf_map_area_mmapable_alloc(size_t size, int numa_node);
 void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 4e7809408073..b4c58a191eb1 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -93,6 +93,7 @@ extern void *vzalloc(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
 extern void *vzalloc_node(unsigned long size, int node);
+extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags);
 extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
 extern void *vmalloc_32_user(unsigned long size);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4842a134b202..dbbcf0b02970 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -348,6 +348,9 @@ enum bpf_attach_type {
 /* Clone map from listener for newly accepted socket */
 #define BPF_F_CLONE		(1U << 9)
 
+/* Enable memory-mapping BPF map */
+#define BPF_F_MMAPABLE		(1U << 10)
+
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1c65ce0098a9..a42097c36b0c 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -14,7 +14,7 @@
 #include "map_in_map.h"
 
 #define ARRAY_CREATE_FLAG_MASK \
-	(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
+	(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK)
 
 static void bpf_array_free_percpu(struct bpf_array *array)
 {
@@ -59,6 +59,10 @@ int array_map_alloc_check(union bpf_attr *attr)
 	    (percpu && numa_node != NUMA_NO_NODE))
 		return -EINVAL;
 
+	if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
+	    attr->map_flags & BPF_F_MMAPABLE)
+		return -EINVAL;
+
 	if (attr->value_size > KMALLOC_MAX_SIZE)
 		/* if value_size is bigger, the user space won't be able to
 		 * access the elements.
@@ -102,10 +106,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	}
 
 	array_size = sizeof(*array);
-	if (percpu)
+	if (percpu) {
 		array_size += (u64) max_entries * sizeof(void *);
-	else
-		array_size += (u64) max_entries * elem_size;
+	} else {
+		/* rely on vmalloc() to return page-aligned memory and
+		 * ensure array->value is exactly page-aligned
+		 */
+		if (attr->map_flags & BPF_F_MMAPABLE) {
+			array_size = PAGE_ALIGN(array_size);
+			array_size += PAGE_ALIGN((u64) max_entries * elem_size);
+		} else {
+			array_size += (u64) max_entries * elem_size;
+		}
+	}
 
 	/* make sure there is no u32 overflow later in round_up() */
 	cost = array_size;
@@ -117,7 +130,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(ret);
 
 	/* allocate all map elements and zero-initialize them */
-	array = bpf_map_area_alloc(array_size, numa_node);
+	if (attr->map_flags & BPF_F_MMAPABLE) {
+		void *data;
+
+		/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
+		data = bpf_map_area_mmapable_alloc(array_size, numa_node);
+		if (!data) {
+			bpf_map_charge_finish(&mem);
+			return ERR_PTR(-ENOMEM);
+		}
+		array = data + PAGE_ALIGN(sizeof(struct bpf_array))
+			- offsetof(struct bpf_array, value);
+	} else {
+		array = bpf_map_area_alloc(array_size, numa_node);
+	}
 	if (!array) {
 		bpf_map_charge_finish(&mem);
 		return ERR_PTR(-ENOMEM);
@@ -350,6 +376,11 @@ static int array_map_delete_elem(struct bpf_map *map, void *key)
 	return -EINVAL;
 }
 
+static void *array_map_vmalloc_addr(struct bpf_array *array)
+{
+	return (void *)round_down((unsigned long)array, PAGE_SIZE);
+}
+
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
 static void array_map_free(struct bpf_map *map)
 {
@@ -365,7 +396,10 @@ static void array_map_free(struct bpf_map *map)
 	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
 		bpf_array_free_percpu(array);
 
-	bpf_map_area_free(array);
+	if (array->map.map_flags & BPF_F_MMAPABLE)
+		bpf_map_area_free(array_map_vmalloc_addr(array));
+	else
+		bpf_map_area_free(array);
 }
 
 static void array_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -444,6 +478,17 @@ static int array_map_check_btf(const struct bpf_map *map,
 	return 0;
 }
 
+int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;
+
+	if (!(map->map_flags & BPF_F_MMAPABLE))
+		return -EINVAL;
+
+	return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff);
+}
+
 const struct bpf_map_ops array_map_ops = {
 	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
@@ -455,6 +500,7 @@ const struct bpf_map_ops array_map_ops = {
 	.map_gen_lookup = array_map_gen_lookup,
 	.map_direct_value_addr = array_map_direct_value_addr,
 	.map_direct_value_meta = array_map_direct_value_meta,
+	.map_mmap = array_map_mmap,
 	.map_seq_show_elem = array_map_seq_show_elem,
 	.map_check_btf = array_map_check_btf,
 };
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 52fe4bacb330..bac3becf9f90 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -127,7 +127,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 	return map;
 }
 
-void *bpf_map_area_alloc(size_t size, int numa_node)
+static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable)
 {
 	/* We really just want to fail instead of triggering OOM killer
 	 * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
@@ -142,18 +142,33 @@ void *bpf_map_area_alloc(size_t size, int numa_node)
 	const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
 	void *area;
 
-	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+	/* kmalloc()'ed memory can't be mmap()'ed */
+	if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
 		area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
 				    numa_node);
 		if (area != NULL)
 			return area;
 	}
-
+	if (mmapable) {
+		BUG_ON(!PAGE_ALIGNED(size));
+		return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL |
+					       __GFP_RETRY_MAYFAIL | flags);
+	}
 	return __vmalloc_node_flags_caller(size, numa_node,
 					   GFP_KERNEL | __GFP_RETRY_MAYFAIL |
 					   flags, __builtin_return_address(0));
 }
 
+void *bpf_map_area_alloc(size_t size, int numa_node)
+{
+	return __bpf_map_area_alloc(size, numa_node, false);
+}
+
+void *bpf_map_area_mmapable_alloc(size_t size, int numa_node)
+{
+	return __bpf_map_area_alloc(size, numa_node, true);
+}
+
 void bpf_map_area_free(void *area)
 {
 	kvfree(area);
@@ -425,6 +440,74 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
 	return -EINVAL;
 }
 
+/* called for any extra memory-mapped regions (except initial) */
+static void bpf_map_mmap_open(struct vm_area_struct *vma)
+{
+	struct bpf_map *map = vma->vm_file->private_data;
+
+	bpf_map_inc_with_uref(map);
+
+	if (vma->vm_flags & VM_WRITE) {
+		mutex_lock(&map->freeze_mutex);
+		map->writecnt++;
+		mutex_unlock(&map->freeze_mutex);
+	}
+}
+
+/* called for all unmapped memory region (including initial) */
+static void bpf_map_mmap_close(struct vm_area_struct *vma)
+{
+	struct bpf_map *map = vma->vm_file->private_data;
+
+	if (vma->vm_flags & VM_WRITE) {
+		mutex_lock(&map->freeze_mutex);
+		map->writecnt--;
+		mutex_unlock(&map->freeze_mutex);
+	}
+
+	bpf_map_put_with_uref(map);
+}
+
+static const struct vm_operations_struct bpf_map_default_vmops = {
+	.open		= bpf_map_mmap_open,
+	.close		= bpf_map_mmap_close,
+};
+
+static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct bpf_map *map = filp->private_data;
+	int err;
+
+	if (!map->ops->map_mmap || map_value_has_spin_lock(map))
+		return -ENOTSUPP;
+
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+
+	mutex_lock(&map->freeze_mutex);
+
+	if ((vma->vm_flags & VM_WRITE) && map->frozen) {
+		err = -EPERM;
+		goto out;
+	}
+
+	/* set default open/close callbacks */
+	vma->vm_ops = &bpf_map_default_vmops;
+	vma->vm_private_data = map;
+
+	err = map->ops->map_mmap(map, vma);
+	if (err)
+		goto out;
+
+	bpf_map_inc_with_uref(map);
+
+	if (vma->vm_flags & VM_WRITE)
+		map->writecnt++;
+out:
+	mutex_unlock(&map->freeze_mutex);
+	return err;
+}
+
 const struct file_operations bpf_map_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= bpf_map_show_fdinfo,
@@ -432,6 +515,7 @@ const struct file_operations bpf_map_fops = {
 	.release	= bpf_map_release,
 	.read		= bpf_dummy_read,
 	.write		= bpf_dummy_write,
+	.mmap		= bpf_map_mmap,
 };
 
 int bpf_map_new_fd(struct bpf_map *map, int flags)
@@ -577,6 +661,7 @@ static int map_create(union bpf_attr *attr)
 
 	atomic64_set(&map->refcnt, 1);
 	atomic64_set(&map->usercnt, 1);
+	mutex_init(&map->freeze_mutex);
 
 	if (attr->btf_key_type_id || attr->btf_value_type_id) {
 		struct btf *btf;
@@ -1163,6 +1248,13 @@ static int map_freeze(const union bpf_attr *attr)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
+
+	mutex_lock(&map->freeze_mutex);
+
+	if (map->writecnt) {
+		err = -EBUSY;
+		goto err_put;
+	}
 	if (READ_ONCE(map->frozen)) {
 		err = -EBUSY;
 		goto err_put;
@@ -1174,6 +1266,7 @@ static int map_freeze(const union bpf_attr *attr)
 
 	WRITE_ONCE(map->frozen, true);
 err_put:
+	mutex_unlock(&map->freeze_mutex);
 	fdput(f);
 	return err;
 }
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a3c70e275f4e..4a7d7459c4f9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2671,6 +2671,26 @@ void *vzalloc_node(unsigned long size, int node)
 }
 EXPORT_SYMBOL(vzalloc_node);
 
+/**
+ * vmalloc_user_node_flags - allocate memory for userspace on a specific node
+ * @size: allocation size
+ * @node: numa node
+ * @flags: flags for the page level allocator
+ *
+ * The resulting memory area is zeroed so it can be mapped to userspace
+ * without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
+{
+	return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
+				    flags | __GFP_ZERO, PAGE_KERNEL,
+				    VM_USERMAP, node,
+				    __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_user_node_flags);
+
 /**
  * vmalloc_exec - allocate virtually contiguous, executable memory
  * @size:	  allocation size
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4842a134b202..dbbcf0b02970 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -348,6 +348,9 @@ enum bpf_attach_type {
 /* Clone map from listener for newly accepted socket */
 #define BPF_F_CLONE		(1U << 9)
 
+/* Enable memory-mapping BPF map */
+#define BPF_F_MMAPABLE		(1U << 10)
+
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
-- 
cgit v1.2.3-59-g8ed1b


From 298e54fa810e027f1b0800d789eb862592721f08 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 15 Nov 2019 19:56:51 +0000
Subject: net: phy: add core phylib sfp support

Add core phylib help for supporting SFP sockets on PHYs.  This provides
a mechanism to inform the SFP layer about PHY up/down events, and also
unregister the SFP bus when the PHY is going away.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c        |  7 +++++
 drivers/net/phy/phy_device.c | 66 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h          | 11 ++++++++
 3 files changed, 84 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 105d389b58e7..36d4ffe1cd3f 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -23,6 +23,7 @@
 #include <linux/ethtool.h>
 #include <linux/phy.h>
 #include <linux/phy_led_triggers.h>
+#include <linux/sfp.h>
 #include <linux/workqueue.h>
 #include <linux/mdio.h>
 #include <linux/io.h>
@@ -841,6 +842,9 @@ void phy_stop(struct phy_device *phydev)
 
 	mutex_lock(&phydev->lock);
 
+	if (phydev->sfp_bus)
+		sfp_upstream_stop(phydev->sfp_bus);
+
 	phydev->state = PHY_HALTED;
 
 	mutex_unlock(&phydev->lock);
@@ -875,6 +879,9 @@ void phy_start(struct phy_device *phydev)
 		goto out;
 	}
 
+	if (phydev->sfp_bus)
+		sfp_upstream_start(phydev->sfp_bus);
+
 	/* if phy was suspended, bring the physical link up again */
 	__phy_resume(phydev);
 
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index fa71998fea51..40f1942f1606 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -27,6 +27,7 @@
 #include <linux/bitmap.h>
 #include <linux/phy.h>
 #include <linux/phy_led_triggers.h>
+#include <linux/sfp.h>
 #include <linux/mdio.h>
 #include <linux/io.h>
 #include <linux/uaccess.h>
@@ -1174,6 +1175,65 @@ phy_standalone_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(phy_standalone);
 
+/**
+ * phy_sfp_attach - attach the SFP bus to the PHY upstream network device
+ * @upstream: pointer to the phy device
+ * @bus: sfp bus representing cage being attached
+ *
+ * This is used to fill in the sfp_upstream_ops .attach member.
+ */
+void phy_sfp_attach(void *upstream, struct sfp_bus *bus)
+{
+	struct phy_device *phydev = upstream;
+
+	if (phydev->attached_dev)
+		phydev->attached_dev->sfp_bus = bus;
+	phydev->sfp_bus_attached = true;
+}
+EXPORT_SYMBOL(phy_sfp_attach);
+
+/**
+ * phy_sfp_detach - detach the SFP bus from the PHY upstream network device
+ * @upstream: pointer to the phy device
+ * @bus: sfp bus representing cage being attached
+ *
+ * This is used to fill in the sfp_upstream_ops .detach member.
+ */
+void phy_sfp_detach(void *upstream, struct sfp_bus *bus)
+{
+	struct phy_device *phydev = upstream;
+
+	if (phydev->attached_dev)
+		phydev->attached_dev->sfp_bus = NULL;
+	phydev->sfp_bus_attached = false;
+}
+EXPORT_SYMBOL(phy_sfp_detach);
+
+/**
+ * phy_sfp_probe - probe for a SFP cage attached to this PHY device
+ * @phydev: Pointer to phy_device
+ * @ops: SFP's upstream operations
+ */
+int phy_sfp_probe(struct phy_device *phydev,
+		  const struct sfp_upstream_ops *ops)
+{
+	struct sfp_bus *bus;
+	int ret;
+
+	if (phydev->mdio.dev.fwnode) {
+		bus = sfp_bus_find_fwnode(phydev->mdio.dev.fwnode);
+		if (IS_ERR(bus))
+			return PTR_ERR(bus);
+
+		phydev->sfp_bus = bus;
+
+		ret = sfp_bus_add_upstream(bus, phydev, ops);
+		sfp_bus_put(bus);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(phy_sfp_probe);
+
 /**
  * phy_attach_direct - attach a network device to a given PHY device pointer
  * @dev: network device to attach
@@ -1249,6 +1309,9 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 	if (dev) {
 		phydev->attached_dev = dev;
 		dev->phydev = phydev;
+
+		if (phydev->sfp_bus_attached)
+			dev->sfp_bus = phydev->sfp_bus;
 	}
 
 	/* Some Ethernet drivers try to connect to a PHY device before
@@ -2418,6 +2481,9 @@ static int phy_remove(struct device *dev)
 	phydev->state = PHY_DOWN;
 	mutex_unlock(&phydev->lock);
 
+	sfp_bus_del_upstream(phydev->sfp_bus);
+	phydev->sfp_bus = NULL;
+
 	if (phydev->drv && phydev->drv->remove) {
 		phydev->drv->remove(phydev);
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 78436d58ce7c..124516fe2763 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -203,6 +203,8 @@ static inline const char *phy_modes(phy_interface_t interface)
 
 struct device;
 struct phylink;
+struct sfp_bus;
+struct sfp_upstream_ops;
 struct sk_buff;
 
 /*
@@ -342,6 +344,8 @@ struct phy_c45_device_ids {
  * dev_flags: Device-specific flags used by the PHY driver.
  * irq: IRQ number of the PHY's interrupt (-1 if none)
  * phy_timer: The timer for handling the state machine
+ * sfp_bus_attached: flag indicating whether the SFP bus has been attached
+ * sfp_bus: SFP bus attached to this PHY's fiber port
  * attached_dev: The attached enet driver's device instance ptr
  * adjust_link: Callback for the enet controller to respond to
  * changes in the link state.
@@ -432,6 +436,9 @@ struct phy_device {
 
 	struct mutex lock;
 
+	/* This may be modified under the rtnl lock */
+	bool sfp_bus_attached;
+	struct sfp_bus *sfp_bus;
 	struct phylink *phylink;
 	struct net_device *attached_dev;
 
@@ -1020,6 +1027,10 @@ int phy_suspend(struct phy_device *phydev);
 int phy_resume(struct phy_device *phydev);
 int __phy_resume(struct phy_device *phydev);
 int phy_loopback(struct phy_device *phydev, bool enable);
+void phy_sfp_attach(void *upstream, struct sfp_bus *bus);
+void phy_sfp_detach(void *upstream, struct sfp_bus *bus);
+int phy_sfp_probe(struct phy_device *phydev,
+	          const struct sfp_upstream_ops *ops);
 struct phy_device *phy_attach(struct net_device *dev, const char *bus_id,
 			      phy_interface_t interface);
 struct phy_device *phy_find_first(struct mii_bus *bus);
-- 
cgit v1.2.3-59-g8ed1b


From 7c9e69428da39ed761c9d903c4850368fa4ef7bf Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Sat, 16 Nov 2019 12:22:43 +0100
Subject: page_pool: add destroy attempts counter and rename tracepoint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When Jonathan change the page_pool to become responsible to its
own shutdown via deferred work queue, then the disconnect_cnt
counter was removed from xdp memory model tracepoint.

This patch change the page_pool_inflight tracepoint name to
page_pool_release, because it reflects the new responsability
better.  And it reintroduces a counter that reflect the number of
times page_pool_release have been tried.

The counter is also used by the code, to only empty the alloc
cache once.  With a stuck work queue running every second and
counter being 64-bit, it will overrun in approx 584 billion
years. For comparison, Earth lifetime expectancy is 7.5 billion
years, before the Sun will engulf, and destroy, the Earth.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h          |  2 ++
 include/trace/events/page_pool.h |  9 ++++++---
 net/core/page_pool.c             | 13 +++++++++++--
 3 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 1121faa99c12..ace881c15dcb 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -112,6 +112,8 @@ struct page_pool {
 	 * refcnt serves purpose is to simplify drivers error handling.
 	 */
 	refcount_t user_cnt;
+
+	u64 destroy_cnt;
 };
 
 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h
index 47b5ee880aa9..ee7f1aca7839 100644
--- a/include/trace/events/page_pool.h
+++ b/include/trace/events/page_pool.h
@@ -10,7 +10,7 @@
 
 #include <net/page_pool.h>
 
-TRACE_EVENT(page_pool_inflight,
+TRACE_EVENT(page_pool_release,
 
 	TP_PROTO(const struct page_pool *pool,
 		 s32 inflight, u32 hold, u32 release),
@@ -22,6 +22,7 @@ TRACE_EVENT(page_pool_inflight,
 		__field(s32,	inflight)
 		__field(u32,	hold)
 		__field(u32,	release)
+		__field(u64,	cnt)
 	),
 
 	TP_fast_assign(
@@ -29,10 +30,12 @@ TRACE_EVENT(page_pool_inflight,
 		__entry->inflight	= inflight;
 		__entry->hold		= hold;
 		__entry->release	= release;
+		__entry->cnt		= pool->destroy_cnt;
 	),
 
-	TP_printk("page_pool=%p inflight=%d hold=%u release=%u",
-	  __entry->pool, __entry->inflight, __entry->hold, __entry->release)
+	TP_printk("page_pool=%p inflight=%d hold=%u release=%u cnt=%llu",
+		__entry->pool, __entry->inflight, __entry->hold,
+		__entry->release, __entry->cnt)
 );
 
 TRACE_EVENT(page_pool_state_release,
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index dfc2501c35d9..e28db2ef8e12 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -200,7 +200,7 @@ static s32 page_pool_inflight(struct page_pool *pool)
 
 	inflight = _distance(hold_cnt, release_cnt);
 
-	trace_page_pool_inflight(pool, inflight, hold_cnt, release_cnt);
+	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
 	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
 
 	return inflight;
@@ -349,10 +349,13 @@ static void page_pool_free(struct page_pool *pool)
 	kfree(pool);
 }
 
-static void page_pool_scrub(struct page_pool *pool)
+static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
 {
 	struct page *page;
 
+	if (pool->destroy_cnt)
+		return;
+
 	/* Empty alloc cache, assume caller made sure this is
 	 * no-longer in use, and page_pool_alloc_pages() cannot be
 	 * call concurrently.
@@ -361,6 +364,12 @@ static void page_pool_scrub(struct page_pool *pool)
 		page = pool->alloc.cache[--pool->alloc.count];
 		__page_pool_return_page(pool, page);
 	}
+}
+
+static void page_pool_scrub(struct page_pool *pool)
+{
+	page_pool_empty_alloc_cache_once(pool);
+	pool->destroy_cnt++;
 
 	/* No more consumers should exist, but producers could still
 	 * be in-flight.
-- 
cgit v1.2.3-59-g8ed1b


From 832ccf6f80cda06ad2373cd1f40291b0183958b4 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Sat, 16 Nov 2019 12:22:48 +0100
Subject: page_pool: extend tracepoint to also include the page PFN

The MM tracepoint for page free (called kmem:mm_page_free) doesn't provide
the page pointer directly, instead it provides the PFN (Page Frame Number).
This is annoying when writing a page_pool leak detector in BPF.

This patch change page_pool tracepoints to also provide the PFN.
The page pointer is still provided to allow other kinds of
troubleshooting from BPF.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/page_pool.h | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h
index ee7f1aca7839..2f2a10e8eb56 100644
--- a/include/trace/events/page_pool.h
+++ b/include/trace/events/page_pool.h
@@ -8,6 +8,7 @@
 #include <linux/types.h>
 #include <linux/tracepoint.h>
 
+#include <trace/events/mmflags.h>
 #include <net/page_pool.h>
 
 TRACE_EVENT(page_pool_release,
@@ -49,16 +50,18 @@ TRACE_EVENT(page_pool_state_release,
 		__field(const struct page_pool *,	pool)
 		__field(const struct page *,		page)
 		__field(u32,				release)
+		__field(unsigned long,			pfn)
 	),
 
 	TP_fast_assign(
 		__entry->pool		= pool;
 		__entry->page		= page;
 		__entry->release	= release;
+		__entry->pfn		= page_to_pfn(page);
 	),
 
-	TP_printk("page_pool=%p page=%p release=%u",
-		  __entry->pool, __entry->page, __entry->release)
+	TP_printk("page_pool=%p page=%p pfn=%lu release=%u",
+		  __entry->pool, __entry->page, __entry->pfn, __entry->release)
 );
 
 TRACE_EVENT(page_pool_state_hold,
@@ -72,16 +75,18 @@ TRACE_EVENT(page_pool_state_hold,
 		__field(const struct page_pool *,	pool)
 		__field(const struct page *,		page)
 		__field(u32,				hold)
+		__field(unsigned long,			pfn)
 	),
 
 	TP_fast_assign(
 		__entry->pool	= pool;
 		__entry->page	= page;
 		__entry->hold	= hold;
+		__entry->pfn	= page_to_pfn(page);
 	),
 
-	TP_printk("page_pool=%p page=%p hold=%u",
-		  __entry->pool, __entry->page, __entry->hold)
+	TP_printk("page_pool=%p page=%p pfn=%lu hold=%u",
+		  __entry->pool, __entry->page, __entry->pfn, __entry->hold)
 );
 
 #endif /* _TRACE_PAGE_POOL_H */
-- 
cgit v1.2.3-59-g8ed1b


From 7cd9a58d6860ae09acd7f0c219b5fa333703f72f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Nov 2019 23:05:52 +0100
Subject: netfilter: nf_tables: constify nft_reg_load{8, 16, 64}()

This patch constifies the pointer to source register data that is passed
as an input parameter.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netfilter/nf_tables.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 87b758407868..fe7c50acc681 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -114,7 +114,7 @@ static inline void nft_reg_store8(u32 *dreg, u8 val)
 	*(u8 *)dreg = val;
 }
 
-static inline u8 nft_reg_load8(u32 *sreg)
+static inline u8 nft_reg_load8(const u32 *sreg)
 {
 	return *(u8 *)sreg;
 }
@@ -125,7 +125,7 @@ static inline void nft_reg_store16(u32 *dreg, u16 val)
 	*(u16 *)dreg = val;
 }
 
-static inline u16 nft_reg_load16(u32 *sreg)
+static inline u16 nft_reg_load16(const u32 *sreg)
 {
 	return *(u16 *)sreg;
 }
@@ -135,7 +135,7 @@ static inline void nft_reg_store64(u32 *dreg, u64 val)
 	put_unaligned(val, (u64 *)dreg);
 }
 
-static inline u64 nft_reg_load64(u32 *sreg)
+static inline u64 nft_reg_load64(const u32 *sreg)
 {
 	return get_unaligned((u64 *)sreg);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 8819efc9430142957c9c8fc7c09d9107e2061b87 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Nov 2019 23:05:53 +0100
Subject: netfilter: nf_tables_offload: allow ethernet interface type only

Hardware offload support at this stage assumes an ethernet device in
place. The flow dissector provides the intermediate representation to
express this selector, so extend it to allow to store the interface
type. Flower does not uses this, so skb_flow_dissect_meta() is not
extended to match on this new field.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 2 ++
 net/netfilter/nft_cmp.c      | 6 ++++++
 net/netfilter/nft_meta.c     | 4 ++++
 3 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index b1063db63e66..1a0727d1acfa 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -203,9 +203,11 @@ struct flow_dissector_key_ip {
 /**
  * struct flow_dissector_key_meta:
  * @ingress_ifindex: ingress ifindex
+ * @ingress_iftype: ingress interface type
  */
 struct flow_dissector_key_meta {
 	int ingress_ifindex;
+	u16 ingress_iftype;
 };
 
 /**
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 0744b2bb46da..b8092069f868 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
+#include <linux/if_arp.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables_offload.h>
@@ -125,6 +126,11 @@ static int __nft_cmp_offload(struct nft_offload_ctx *ctx,
 	flow->match.dissector.used_keys |= BIT(reg->key);
 	flow->match.dissector.offset[reg->key] = reg->base_offset;
 
+	if (reg->key == FLOW_DISSECTOR_KEY_META &&
+	    reg->offset == offsetof(struct nft_flow_key, meta.ingress_iftype) &&
+	    nft_reg_load16(priv->data.data) != ARPHRD_ETHER)
+		return -EOPNOTSUPP;
+
 	nft_offload_update_dependency(ctx, &priv->data, priv->len);
 
 	return 0;
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 8fbea031bd4a..9740b554fdb3 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -551,6 +551,10 @@ static int nft_meta_get_offload(struct nft_offload_ctx *ctx,
 		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta,
 				  ingress_ifindex, sizeof(__u32), reg);
 		break;
+	case NFT_META_IIFTYPE:
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_META, meta,
+				  ingress_iftype, sizeof(__u16), reg);
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From a82055af595946aea461528e551e6ae064b3d560 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Nov 2019 23:05:54 +0100
Subject: netfilter: nft_payload: add VLAN offload support

Match on ethertype and set up protocol dependency. Check for protocol
dependency before accessing the tci field. Allow to match on the
encapsulated ethertype too.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h |  9 ++++++---
 net/netfilter/nft_payload.c  | 22 ++++++++++++++++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 1a0727d1acfa..f06b0239c32b 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -48,9 +48,12 @@ struct flow_dissector_key_tags {
 };
 
 struct flow_dissector_key_vlan {
-	u16	vlan_id:12,
-		vlan_dei:1,
-		vlan_priority:3;
+	union {
+		u16	vlan_id:12,
+			vlan_dei:1,
+			vlan_priority:3;
+		__be16	vlan_tci;
+	};
 	__be16	vlan_tpid;
 };
 
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 0877d46b8605..f17939fbf6c3 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -182,6 +182,28 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx,
 		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs,
 				  dst, ETH_ALEN, reg);
 		break;
+	case offsetof(struct ethhdr, h_proto):
+		if (priv->len != sizeof(__be16))
+			return -EOPNOTSUPP;
+
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic,
+				  n_proto, sizeof(__be16), reg);
+		nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK);
+		break;
+	case offsetof(struct vlan_ethhdr, h_vlan_TCI):
+		if (priv->len != sizeof(__be16))
+			return -EOPNOTSUPP;
+
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan,
+				  vlan_tci, sizeof(__be16), reg);
+		break;
+	case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto):
+		if (priv->len != sizeof(__be16))
+			return -EOPNOTSUPP;
+
+		NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan,
+				  vlan_tpid, sizeof(__be16), reg);
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From bc836748707cf6b8b1a948b61149278f109107da Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Wed, 20 Nov 2019 00:15:17 +0000
Subject: page_pool: Add API to update numa node

Add page_pool_update_nid() to be called by page pool consumers when they
detect numa node changes.

It will update the page pool nid value to start allocating from the new
effective numa node.

This is to mitigate page pool allocating pages from a wrong numa node,
where the pool was originally allocated, and holding on to pages that
belong to a different numa node, which causes performance degradation.

For pages that are already being consumed and could be returned to the
pool by the consumer, in next patch we will add a check per page to avoid
recycling them back to the pool and return them to the page allocator.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h          |  7 +++++++
 include/trace/events/page_pool.h | 22 ++++++++++++++++++++++
 net/core/page_pool.c             |  8 ++++++++
 3 files changed, 37 insertions(+)

(limited to 'include')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index ace881c15dcb..e2e1b7b1e8ba 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -204,4 +204,11 @@ static inline bool page_pool_put(struct page_pool *pool)
 	return refcount_dec_and_test(&pool->user_cnt);
 }
 
+/* Caller must provide appropriate safe context, e.g. NAPI. */
+void page_pool_update_nid(struct page_pool *pool, int new_nid);
+static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid)
+{
+	if (unlikely(pool->p.nid != new_nid))
+		page_pool_update_nid(pool, new_nid);
+}
 #endif /* _NET_PAGE_POOL_H */
diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h
index 2f2a10e8eb56..ad0aa7f31675 100644
--- a/include/trace/events/page_pool.h
+++ b/include/trace/events/page_pool.h
@@ -89,6 +89,28 @@ TRACE_EVENT(page_pool_state_hold,
 		  __entry->pool, __entry->page, __entry->pfn, __entry->hold)
 );
 
+TRACE_EVENT(page_pool_update_nid,
+
+	TP_PROTO(const struct page_pool *pool, int new_nid),
+
+	TP_ARGS(pool, new_nid),
+
+	TP_STRUCT__entry(
+		__field(const struct page_pool *, pool)
+		__field(int,			  pool_nid)
+		__field(int,			  new_nid)
+	),
+
+	TP_fast_assign(
+		__entry->pool		= pool;
+		__entry->pool_nid	= pool->p.nid;
+		__entry->new_nid	= new_nid;
+	),
+
+	TP_printk("page_pool=%p pool_nid=%d new_nid=%d",
+		  __entry->pool, __entry->pool_nid, __entry->new_nid)
+);
+
 #endif /* _TRACE_PAGE_POOL_H */
 
 /* This part must be outside protection */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index e28db2ef8e12..9b704ea3f4b2 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -436,3 +436,11 @@ void page_pool_destroy(struct page_pool *pool)
 	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
 }
 EXPORT_SYMBOL(page_pool_destroy);
+
+/* Caller must provide appropriate safe context, e.g. NAPI. */
+void page_pool_update_nid(struct page_pool *pool, int new_nid)
+{
+	trace_page_pool_update_nid(pool, new_nid);
+	pool->p.nid = new_nid;
+}
+EXPORT_SYMBOL(page_pool_update_nid);
-- 
cgit v1.2.3-59-g8ed1b


From cec2975f2b7058c42330a0f8164d94c6b7c8c446 Mon Sep 17 00:00:00 2001
From: Gautam Ramakrishnan <gautamramk@gmail.com>
Date: Wed, 20 Nov 2019 19:43:54 +0530
Subject: net: sched: pie: enable timestamp based delay calculation

RFC 8033 suggests an alternative approach to calculate the queue
delay in PIE by using a timestamp on every enqueued packet. This
patch adds an implementation of that approach and sets it as the
default method to calculate queue delay. The previous method (based
on Little's law) to calculate queue delay is set as optional.

Signed-off-by: Gautam Ramakrishnan <gautamramk@gmail.com>
Signed-off-by: Leslie Monis <lesliemonis@gmail.com>
Signed-off-by: Mohit P. Tahiliani <tahiliani@nitk.edu.in>
Acked-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h |  22 +++++---
 net/sched/sch_pie.c            | 120 +++++++++++++++++++++++++++++++++--------
 2 files changed, 113 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 5011259b8f67..9f1a72876212 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -950,19 +950,25 @@ enum {
 	TCA_PIE_BETA,
 	TCA_PIE_ECN,
 	TCA_PIE_BYTEMODE,
+	TCA_PIE_DQ_RATE_ESTIMATOR,
 	__TCA_PIE_MAX
 };
 #define TCA_PIE_MAX   (__TCA_PIE_MAX - 1)
 
 struct tc_pie_xstats {
-	__u64 prob;             /* current probability */
-	__u32 delay;            /* current delay in ms */
-	__u32 avg_dq_rate;      /* current average dq_rate in bits/pie_time */
-	__u32 packets_in;       /* total number of packets enqueued */
-	__u32 dropped;          /* packets dropped due to pie_action */
-	__u32 overlimit;        /* dropped due to lack of space in queue */
-	__u32 maxq;             /* maximum queue size */
-	__u32 ecn_mark;         /* packets marked with ecn*/
+	__u64 prob;			/* current probability */
+	__u32 delay;			/* current delay in ms */
+	__u32 avg_dq_rate;		/* current average dq_rate in
+					 * bits/pie_time
+					 */
+	__u32 dq_rate_estimating;	/* is avg_dq_rate being calculated? */
+	__u32 packets_in;		/* total number of packets enqueued */
+	__u32 dropped;			/* packets dropped due to pie_action */
+	__u32 overlimit;		/* dropped due to lack of space
+					 * in queue
+					 */
+	__u32 maxq;			/* maximum queue size */
+	__u32 ecn_mark;			/* packets marked with ecn*/
 };
 
 /* CBS */
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index df98a887eb89..b0b0dc46af61 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -22,6 +22,7 @@
 
 #define QUEUE_THRESHOLD 16384
 #define DQCOUNT_INVALID -1
+#define DTIME_INVALID 0xffffffffffffffff
 #define MAX_PROB 0xffffffffffffffff
 #define PIE_SCALE 8
 
@@ -34,6 +35,7 @@ struct pie_params {
 	u32 beta;		/* and are used for shift relative to 1 */
 	bool ecn;		/* true if ecn is enabled */
 	bool bytemode;		/* to scale drop early prob based on pkt size */
+	u8 dq_rate_estimator;	/* to calculate delay using Little's law */
 };
 
 /* variables used */
@@ -77,11 +79,34 @@ static void pie_params_init(struct pie_params *params)
 	params->target = PSCHED_NS2TICKS(15 * NSEC_PER_MSEC);	/* 15 ms */
 	params->ecn = false;
 	params->bytemode = false;
+	params->dq_rate_estimator = false;
+}
+
+/* private skb vars */
+struct pie_skb_cb {
+	psched_time_t enqueue_time;
+};
+
+static struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb)
+{
+	qdisc_cb_private_validate(skb, sizeof(struct pie_skb_cb));
+	return (struct pie_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static psched_time_t pie_get_enqueue_time(const struct sk_buff *skb)
+{
+	return get_pie_cb(skb)->enqueue_time;
+}
+
+static void pie_set_enqueue_time(struct sk_buff *skb)
+{
+	get_pie_cb(skb)->enqueue_time = psched_get_time();
 }
 
 static void pie_vars_init(struct pie_vars *vars)
 {
 	vars->dq_count = DQCOUNT_INVALID;
+	vars->dq_tstamp = DTIME_INVALID;
 	vars->accu_prob = 0;
 	vars->avg_dq_rate = 0;
 	/* default of 150 ms in pschedtime */
@@ -172,6 +197,10 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 
 	/* we can enqueue the packet */
 	if (enqueue) {
+		/* Set enqueue time only when dq_rate_estimator is disabled. */
+		if (!q->params.dq_rate_estimator)
+			pie_set_enqueue_time(skb);
+
 		q->stats.packets_in++;
 		if (qdisc_qlen(sch) > q->stats.maxq)
 			q->stats.maxq = qdisc_qlen(sch);
@@ -194,6 +223,7 @@ static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
 	[TCA_PIE_BETA] = {.type = NLA_U32},
 	[TCA_PIE_ECN] = {.type = NLA_U32},
 	[TCA_PIE_BYTEMODE] = {.type = NLA_U32},
+	[TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32},
 };
 
 static int pie_change(struct Qdisc *sch, struct nlattr *opt,
@@ -247,6 +277,10 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
 	if (tb[TCA_PIE_BYTEMODE])
 		q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);
 
+	if (tb[TCA_PIE_DQ_RATE_ESTIMATOR])
+		q->params.dq_rate_estimator =
+				nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]);
+
 	/* Drop excess packets if new limit is lower */
 	qlen = sch->q.qlen;
 	while (sch->q.qlen > sch->limit) {
@@ -266,6 +300,28 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct pie_sched_data *q = qdisc_priv(sch);
 	int qlen = sch->qstats.backlog;	/* current queue size in bytes */
+	psched_time_t now = psched_get_time();
+	u32 dtime = 0;
+
+	/* If dq_rate_estimator is disabled, calculate qdelay using the
+	 * packet timestamp.
+	 */
+	if (!q->params.dq_rate_estimator) {
+		q->vars.qdelay = now - pie_get_enqueue_time(skb);
+
+		if (q->vars.dq_tstamp != DTIME_INVALID)
+			dtime = now - q->vars.dq_tstamp;
+
+		q->vars.dq_tstamp = now;
+
+		if (qlen == 0)
+			q->vars.qdelay = 0;
+
+		if (dtime == 0)
+			return;
+
+		goto burst_allowance_reduction;
+	}
 
 	/* If current queue is about 10 packets or more and dq_count is unset
 	 * we have enough packets to calculate the drain rate. Save
@@ -289,10 +345,10 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
 		q->vars.dq_count += skb->len;
 
 		if (q->vars.dq_count >= QUEUE_THRESHOLD) {
-			psched_time_t now = psched_get_time();
-			u32 dtime = now - q->vars.dq_tstamp;
 			u32 count = q->vars.dq_count << PIE_SCALE;
 
+			dtime = now - q->vars.dq_tstamp;
+
 			if (dtime == 0)
 				return;
 
@@ -317,14 +373,19 @@ static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
 				q->vars.dq_tstamp = psched_get_time();
 			}
 
-			if (q->vars.burst_time > 0) {
-				if (q->vars.burst_time > dtime)
-					q->vars.burst_time -= dtime;
-				else
-					q->vars.burst_time = 0;
-			}
+			goto burst_allowance_reduction;
 		}
 	}
+
+	return;
+
+burst_allowance_reduction:
+	if (q->vars.burst_time > 0) {
+		if (q->vars.burst_time > dtime)
+			q->vars.burst_time -= dtime;
+		else
+			q->vars.burst_time = 0;
+	}
 }
 
 static void calculate_probability(struct Qdisc *sch)
@@ -332,19 +393,25 @@ static void calculate_probability(struct Qdisc *sch)
 	struct pie_sched_data *q = qdisc_priv(sch);
 	u32 qlen = sch->qstats.backlog;	/* queue size in bytes */
 	psched_time_t qdelay = 0;	/* in pschedtime */
-	psched_time_t qdelay_old = q->vars.qdelay;	/* in pschedtime */
+	psched_time_t qdelay_old = 0;	/* in pschedtime */
 	s64 delta = 0;		/* determines the change in probability */
 	u64 oldprob;
 	u64 alpha, beta;
 	u32 power;
 	bool update_prob = true;
 
-	q->vars.qdelay_old = q->vars.qdelay;
+	if (q->params.dq_rate_estimator) {
+		qdelay_old = q->vars.qdelay;
+		q->vars.qdelay_old = q->vars.qdelay;
 
-	if (q->vars.avg_dq_rate > 0)
-		qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
-	else
-		qdelay = 0;
+		if (q->vars.avg_dq_rate > 0)
+			qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
+		else
+			qdelay = 0;
+	} else {
+		qdelay = q->vars.qdelay;
+		qdelay_old = q->vars.qdelay_old;
+	}
 
 	/* If qdelay is zero and qlen is not, it means qlen is very small, less
 	 * than dequeue_rate, so we do not update probabilty in this round
@@ -430,14 +497,18 @@ static void calculate_probability(struct Qdisc *sch)
 	/* We restart the measurement cycle if the following conditions are met
 	 * 1. If the delay has been low for 2 consecutive Tupdate periods
 	 * 2. Calculated drop probability is zero
-	 * 3. We have atleast one estimate for the avg_dq_rate ie.,
-	 *    is a non-zero value
+	 * 3. If average dq_rate_estimator is enabled, we have atleast one
+	 *    estimate for the avg_dq_rate ie., is a non-zero value
 	 */
 	if ((q->vars.qdelay < q->params.target / 2) &&
 	    (q->vars.qdelay_old < q->params.target / 2) &&
 	    q->vars.prob == 0 &&
-	    q->vars.avg_dq_rate > 0)
+	    (!q->params.dq_rate_estimator || q->vars.avg_dq_rate > 0)) {
 		pie_vars_init(&q->vars);
+	}
+
+	if (!q->params.dq_rate_estimator)
+		q->vars.qdelay_old = qdelay;
 }
 
 static void pie_timer(struct timer_list *t)
@@ -497,7 +568,9 @@ static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
 	    nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
 	    nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
-	    nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode))
+	    nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) ||
+	    nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR,
+			q->params.dq_rate_estimator))
 		goto nla_put_failure;
 
 	return nla_nest_end(skb, opts);
@@ -514,9 +587,6 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 		.prob		= q->vars.prob,
 		.delay		= ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) /
 				   NSEC_PER_USEC,
-		/* unscale and return dq_rate in bytes per sec */
-		.avg_dq_rate	= q->vars.avg_dq_rate *
-				  (PSCHED_TICKS_PER_SEC) >> PIE_SCALE,
 		.packets_in	= q->stats.packets_in,
 		.overlimit	= q->stats.overlimit,
 		.maxq		= q->stats.maxq,
@@ -524,6 +594,14 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 		.ecn_mark	= q->stats.ecn_mark,
 	};
 
+	/* avg_dq_rate is only valid if dq_rate_estimator is enabled */
+	st.dq_rate_estimating = q->params.dq_rate_estimator;
+
+	/* unscale and return dq_rate in bytes per sec */
+	if (q->params.dq_rate_estimator)
+		st.avg_dq_rate = q->vars.avg_dq_rate *
+				 (PSCHED_TICKS_PER_SEC) >> PIE_SCALE;
+
 	return gnet_stats_copy_app(d, &st, sizeof(st));
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From e68bc75691cc3de608c2c7505057c948d13ae587 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Wed, 20 Nov 2019 16:54:18 +0200
Subject: net: page_pool: add the possibility to sync DMA memory for device

Introduce the following parameters in order to add the possibility to sync
DMA memory for device before putting allocated pages in the page_pool
caches:
- PP_FLAG_DMA_SYNC_DEV: if set in page_pool_params flags, all pages that
  the driver gets from page_pool will be DMA-synced-for-device according
  to the length provided by the device driver. Please note DMA-sync-for-CPU
  is still device driver responsibility
- offset: DMA address offset where the DMA engine starts copying rx data
- max_len: maximum DMA memory size page_pool is allowed to flush. This
  is currently used in __page_pool_alloc_pages_slow routine when pages
  are allocated from page allocator
These parameters are supposed to be set by device drivers.

This optimization reduces the length of the DMA-sync-for-device.
The optimization is valid because pages are initially
DMA-synced-for-device as defined via max_len. At RX time, the driver
will perform a DMA-sync-for-CPU on the memory for the packet length.
What is important is the memory occupied by packet payload, because
this is the area CPU is allowed to read and modify. As we don't track
cache-lines written into by the CPU, simply use the packet payload length
as dma_sync_size at page_pool recycle time. This also take into account
any tail-extend.

Tested-by: Matteo Croce <mcroce@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 24 ++++++++++++++++++------
 net/core/page_pool.c    | 36 ++++++++++++++++++++++++++++++++++--
 2 files changed, 52 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index e2e1b7b1e8ba..cfbed00ba7ee 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -34,8 +34,18 @@
 #include <linux/ptr_ring.h>
 #include <linux/dma-direction.h>
 
-#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */
-#define PP_FLAG_ALL	PP_FLAG_DMA_MAP
+#define PP_FLAG_DMA_MAP		BIT(0) /* Should page_pool do the DMA
+					* map/unmap
+					*/
+#define PP_FLAG_DMA_SYNC_DEV	BIT(1) /* If set all pages that the driver gets
+					* from page_pool will be
+					* DMA-synced-for-device according to
+					* the length provided by the device
+					* driver.
+					* Please note DMA-sync-for-CPU is still
+					* device driver responsibility
+					*/
+#define PP_FLAG_ALL		(PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)
 
 /*
  * Fast allocation side cache array/stack
@@ -65,6 +75,8 @@ struct page_pool_params {
 	int		nid;  /* Numa node id to allocate from pages from */
 	struct device	*dev; /* device, for DMA pre-mapping purposes */
 	enum dma_data_direction dma_dir; /* DMA mapping direction */
+	unsigned int	max_len; /* max DMA sync memory size */
+	unsigned int	offset;  /* DMA addr offset */
 };
 
 struct page_pool {
@@ -151,8 +163,8 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool,
 #endif
 
 /* Never call this directly, use helpers below */
-void __page_pool_put_page(struct page_pool *pool,
-			  struct page *page, bool allow_direct);
+void __page_pool_put_page(struct page_pool *pool, struct page *page,
+			  unsigned int dma_sync_size, bool allow_direct);
 
 static inline void page_pool_put_page(struct page_pool *pool,
 				      struct page *page, bool allow_direct)
@@ -161,14 +173,14 @@ static inline void page_pool_put_page(struct page_pool *pool,
 	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
 	 */
 #ifdef CONFIG_PAGE_POOL
-	__page_pool_put_page(pool, page, allow_direct);
+	__page_pool_put_page(pool, page, -1, allow_direct);
 #endif
 }
 /* Very limited use-cases allow recycle direct */
 static inline void page_pool_recycle_direct(struct page_pool *pool,
 					    struct page *page)
 {
-	__page_pool_put_page(pool, page, true);
+	__page_pool_put_page(pool, page, -1, true);
 }
 
 /* Disconnects a page (from a page_pool).  API users can have a need
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 6c7f78bd6421..a6aefe989043 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -47,6 +47,21 @@ static int page_pool_init(struct page_pool *pool,
 	    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
 		return -EINVAL;
 
+	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
+		/* In order to request DMA-sync-for-device the page
+		 * needs to be mapped
+		 */
+		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+			return -EINVAL;
+
+		if (!pool->p.max_len)
+			return -EINVAL;
+
+		/* pool->p.offset has to be set according to the address
+		 * offset used by the DMA engine to start copying rx data
+		 */
+	}
+
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;
 
@@ -115,6 +130,16 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
 	return page;
 }
 
+static void page_pool_dma_sync_for_device(struct page_pool *pool,
+					  struct page *page,
+					  unsigned int dma_sync_size)
+{
+	dma_sync_size = min(dma_sync_size, pool->p.max_len);
+	dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
+					 pool->p.offset, dma_sync_size,
+					 pool->p.dma_dir);
+}
+
 /* slow path */
 noinline
 static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
@@ -159,6 +184,9 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	}
 	page->dma_addr = dma;
 
+	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
+
 skip_dma_map:
 	/* Track how many pages are held 'in-flight' */
 	pool->pages_state_hold_cnt++;
@@ -292,8 +320,8 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page)
 	return !page_is_pfmemalloc(page) && page_to_nid(page) == pool->p.nid;
 }
 
-void __page_pool_put_page(struct page_pool *pool,
-			  struct page *page, bool allow_direct)
+void __page_pool_put_page(struct page_pool *pool, struct page *page,
+			  unsigned int dma_sync_size, bool allow_direct)
 {
 	/* This allocator is optimized for the XDP mode that uses
 	 * one-frame-per-page, but have fallbacks that act like the
@@ -305,6 +333,10 @@ void __page_pool_put_page(struct page_pool *pool,
 		   pool_page_reusable(pool, page))) {
 		/* Read barrier done in page_ref_count / READ_ONCE */
 
+		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+			page_pool_dma_sync_for_device(pool, page,
+						      dma_sync_size);
+
 		if (allow_direct && in_serving_softirq())
 			if (__page_pool_recycle_direct(page, pool))
 				return;
-- 
cgit v1.2.3-59-g8ed1b


From 91e6015b082b08a74e5d9d326f651e5890a93519 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 20 Nov 2019 22:38:16 +0100
Subject: bpf: Emit audit messages upon successful prog load and unload

Allow for audit messages to be emitted upon BPF program load and
unload for having a timeline of events. The load itself is in
syscall context, so additional info about the process initiating
the BPF prog creation can be logged and later directly correlated
to the unload event.

The only info really needed from BPF side is the globally unique
prog ID where then audit user space tooling can query / dump all
info needed about the specific BPF program right upon load event
and enrich the record, thus these changes needed here can be kept
small and non-intrusive to the core.

Raw example output:

  # auditctl -D
  # auditctl -a always,exit -F arch=x86_64 -S bpf
  # ausearch --start recent -m 1334
  [...]
  ----
  time->Wed Nov 20 12:45:51 2019
  type=PROCTITLE msg=audit(1574271951.590:8974): proctitle="./test_verifier"
  type=SYSCALL msg=audit(1574271951.590:8974): arch=c000003e syscall=321 success=yes exit=14 a0=5 a1=7ffe2d923e80 a2=78 a3=0 items=0 ppid=742 pid=949 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=2 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null)
  type=UNKNOWN[1334] msg=audit(1574271951.590:8974): auid=0 uid=0 gid=0 ses=2 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 pid=949 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" prog-id=3260 event=LOAD
  ----
  time->Wed Nov 20 12:45:51 2019
type=UNKNOWN[1334] msg=audit(1574271951.590:8975): prog-id=3260 event=UNLOAD
  ----
  [...]

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20191120213816.8186-1-jolsa@kernel.org
---
 include/linux/audit.h      |  3 +++
 include/uapi/linux/audit.h |  1 +
 kernel/auditsc.c           |  2 +-
 kernel/bpf/syscall.c       | 31 +++++++++++++++++++++++++++++++
 4 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index aee3dc9eb378..edd006f4597d 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -159,6 +159,7 @@ extern void		    audit_log_key(struct audit_buffer *ab,
 extern void		    audit_log_link_denied(const char *operation);
 extern void		    audit_log_lost(const char *message);
 
+extern void audit_log_task(struct audit_buffer *ab);
 extern int audit_log_task_context(struct audit_buffer *ab);
 extern void audit_log_task_info(struct audit_buffer *ab);
 
@@ -219,6 +220,8 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key)
 { }
 static inline void audit_log_link_denied(const char *string)
 { }
+static inline void audit_log_task(struct audit_buffer *ab)
+{ }
 static inline int audit_log_task_context(struct audit_buffer *ab)
 {
 	return 0;
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index c89c6495983d..32a5db900f47 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -116,6 +116,7 @@
 #define AUDIT_FANOTIFY		1331	/* Fanotify access decision */
 #define AUDIT_TIME_INJOFFSET	1332	/* Timekeeping offset injected */
 #define AUDIT_TIME_ADJNTPVAL	1333	/* NTP value adjustment */
+#define AUDIT_BPF		1334	/* BPF subsystem */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4effe01ebbe2..9bf1045fedfa 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2545,7 +2545,7 @@ void __audit_ntp_log(const struct audit_ntp_data *ad)
 	audit_log_ntp_val(ad, "adjust",	AUDIT_NTP_ADJUST);
 }
 
-static void audit_log_task(struct audit_buffer *ab)
+void audit_log_task(struct audit_buffer *ab)
 {
 	kuid_t auid, uid;
 	kgid_t gid;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bac3becf9f90..17f4254495f2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,7 @@
 #include <linux/timekeeping.h>
 #include <linux/ctype.h>
 #include <linux/nospec.h>
+#include <linux/audit.h>
 #include <uapi/linux/btf.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
@@ -1318,6 +1319,34 @@ static void free_used_maps(struct bpf_prog_aux *aux)
 	kfree(aux->used_maps);
 }
 
+enum bpf_event {
+	BPF_EVENT_LOAD,
+	BPF_EVENT_UNLOAD,
+};
+
+static const char * const bpf_event_audit_str[] = {
+	[BPF_EVENT_LOAD]   = "LOAD",
+	[BPF_EVENT_UNLOAD] = "UNLOAD",
+};
+
+static void bpf_audit_prog(const struct bpf_prog *prog, enum bpf_event event)
+{
+	bool has_task_context = event == BPF_EVENT_LOAD;
+	struct audit_buffer *ab;
+
+	if (audit_enabled == AUDIT_OFF)
+		return;
+	ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_BPF);
+	if (unlikely(!ab))
+		return;
+	if (has_task_context)
+		audit_log_task(ab);
+	audit_log_format(ab, "%sprog-id=%u event=%s",
+			 has_task_context ? " " : "",
+			 prog->aux->id, bpf_event_audit_str[event]);
+	audit_log_end(ab);
+}
+
 int __bpf_prog_charge(struct user_struct *user, u32 pages)
 {
 	unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -1434,6 +1463,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 {
 	if (atomic64_dec_and_test(&prog->aux->refcnt)) {
 		perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
+		bpf_audit_prog(prog, BPF_EVENT_UNLOAD);
 		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog, do_idr_lock);
 		__bpf_prog_put_noref(prog, true);
@@ -1843,6 +1873,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	 */
 	bpf_prog_kallsyms_add(prog);
 	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
+	bpf_audit_prog(prog, BPF_EVENT_LOAD);
 
 	err = bpf_prog_new_fd(prog);
 	if (err < 0)
-- 
cgit v1.2.3-59-g8ed1b


From 196e8ca74886c433dcfc64a809707074b936aaf5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 20 Nov 2019 23:04:44 +0100
Subject: bpf: Switch bpf_map_{area_alloc,area_mmapable_alloc}() to u64 size

Given we recently extended the original bpf_map_area_alloc() helper in
commit fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY"),
we need to apply the same logic as in ff1c08e1f74b ("bpf: Change size
to u64 for bpf_map_{area_alloc, charge_init}()"). To avoid conflicts,
extend it for bpf-next.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h  |  6 +++---
 kernel/bpf/syscall.c | 11 +++++++----
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e913dd5946ae..e89e86122233 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -794,12 +794,12 @@ void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
-int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size);
+int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size);
 void bpf_map_charge_finish(struct bpf_map_memory *mem);
 void bpf_map_charge_move(struct bpf_map_memory *dst,
 			 struct bpf_map_memory *src);
-void *bpf_map_area_alloc(size_t size, int numa_node);
-void *bpf_map_area_mmapable_alloc(size_t size, int numa_node);
+void *bpf_map_area_alloc(u64 size, int numa_node);
+void *bpf_map_area_mmapable_alloc(u64 size, int numa_node);
 void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 17f4254495f2..b51ecb9644d0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -128,7 +128,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 	return map;
 }
 
-static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable)
+static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
 {
 	/* We really just want to fail instead of triggering OOM killer
 	 * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
@@ -143,6 +143,9 @@ static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable)
 	const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
 	void *area;
 
+	if (size >= SIZE_MAX)
+		return NULL;
+
 	/* kmalloc()'ed memory can't be mmap()'ed */
 	if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
 		area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
@@ -160,12 +163,12 @@ static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable)
 					   flags, __builtin_return_address(0));
 }
 
-void *bpf_map_area_alloc(size_t size, int numa_node)
+void *bpf_map_area_alloc(u64 size, int numa_node)
 {
 	return __bpf_map_area_alloc(size, numa_node, false);
 }
 
-void *bpf_map_area_mmapable_alloc(size_t size, int numa_node)
+void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
 {
 	return __bpf_map_area_alloc(size, numa_node, true);
 }
@@ -214,7 +217,7 @@ static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
 		atomic_long_sub(pages, &user->locked_vm);
 }
 
-int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size)
+int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size)
 {
 	u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
 	struct user_struct *user;
-- 
cgit v1.2.3-59-g8ed1b


From f3c9a666b28572b1a0ae691a47d9a7de4d9cefb3 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Wed, 20 Nov 2019 12:29:59 +0000
Subject: net: sfp: soft status and control support

Add support for the soft status and control register, which allows
TX_FAULT and RX_LOS to be monitored and TX_DISABLE to be set.  We
make use of this when the board does not support GPIOs for these
signals.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/sfp.c | 110 +++++++++++++++++++++++++++++++++++++++++---------
 include/linux/sfp.h   |   4 ++
 2 files changed, 94 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index b0f88c2c0153..bdbbb76f8fd3 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -201,7 +201,10 @@ struct sfp {
 	struct gpio_desc *gpio[GPIO_MAX];
 	int gpio_irq[GPIO_MAX];
 
+	bool need_poll;
+
 	struct mutex st_mutex;			/* Protects state */
+	unsigned int state_soft_mask;
 	unsigned int state;
 	struct delayed_work poll;
 	struct delayed_work timeout;
@@ -395,24 +398,90 @@ static int sfp_i2c_configure(struct sfp *sfp, struct i2c_adapter *i2c)
 }
 
 /* Interface */
-static unsigned int sfp_get_state(struct sfp *sfp)
+static int sfp_read(struct sfp *sfp, bool a2, u8 addr, void *buf, size_t len)
 {
-	return sfp->get_state(sfp);
+	return sfp->read(sfp, a2, addr, buf, len);
 }
 
-static void sfp_set_state(struct sfp *sfp, unsigned int state)
+static int sfp_write(struct sfp *sfp, bool a2, u8 addr, void *buf, size_t len)
 {
-	sfp->set_state(sfp, state);
+	return sfp->write(sfp, a2, addr, buf, len);
 }
 
-static int sfp_read(struct sfp *sfp, bool a2, u8 addr, void *buf, size_t len)
+static unsigned int sfp_soft_get_state(struct sfp *sfp)
 {
-	return sfp->read(sfp, a2, addr, buf, len);
+	unsigned int state = 0;
+	u8 status;
+
+	if (sfp_read(sfp, true, SFP_STATUS, &status, sizeof(status)) ==
+		     sizeof(status)) {
+		if (status & SFP_STATUS_RX_LOS)
+			state |= SFP_F_LOS;
+		if (status & SFP_STATUS_TX_FAULT)
+			state |= SFP_F_TX_FAULT;
+	}
+
+	return state & sfp->state_soft_mask;
 }
 
-static int sfp_write(struct sfp *sfp, bool a2, u8 addr, void *buf, size_t len)
+static void sfp_soft_set_state(struct sfp *sfp, unsigned int state)
 {
-	return sfp->write(sfp, a2, addr, buf, len);
+	u8 status;
+
+	if (sfp_read(sfp, true, SFP_STATUS, &status, sizeof(status)) ==
+		     sizeof(status)) {
+		if (state & SFP_F_TX_DISABLE)
+			status |= SFP_STATUS_TX_DISABLE_FORCE;
+		else
+			status &= ~SFP_STATUS_TX_DISABLE_FORCE;
+
+		sfp_write(sfp, true, SFP_STATUS, &status, sizeof(status));
+	}
+}
+
+static void sfp_soft_start_poll(struct sfp *sfp)
+{
+	const struct sfp_eeprom_id *id = &sfp->id;
+
+	sfp->state_soft_mask = 0;
+	if (id->ext.enhopts & SFP_ENHOPTS_SOFT_TX_DISABLE &&
+	    !sfp->gpio[GPIO_TX_DISABLE])
+		sfp->state_soft_mask |= SFP_F_TX_DISABLE;
+	if (id->ext.enhopts & SFP_ENHOPTS_SOFT_TX_FAULT &&
+	    !sfp->gpio[GPIO_TX_FAULT])
+		sfp->state_soft_mask |= SFP_F_TX_FAULT;
+	if (id->ext.enhopts & SFP_ENHOPTS_SOFT_RX_LOS &&
+	    !sfp->gpio[GPIO_LOS])
+		sfp->state_soft_mask |= SFP_F_LOS;
+
+	if (sfp->state_soft_mask & (SFP_F_LOS | SFP_F_TX_FAULT) &&
+	    !sfp->need_poll)
+		mod_delayed_work(system_wq, &sfp->poll, poll_jiffies);
+}
+
+static void sfp_soft_stop_poll(struct sfp *sfp)
+{
+	sfp->state_soft_mask = 0;
+}
+
+static unsigned int sfp_get_state(struct sfp *sfp)
+{
+	unsigned int state = sfp->get_state(sfp);
+
+	if (state & SFP_F_PRESENT &&
+	    sfp->state_soft_mask & (SFP_F_LOS | SFP_F_TX_FAULT))
+		state |= sfp_soft_get_state(sfp);
+
+	return state;
+}
+
+static void sfp_set_state(struct sfp *sfp, unsigned int state)
+{
+	sfp->set_state(sfp, state);
+
+	if (state & SFP_F_PRESENT &&
+	    sfp->state_soft_mask & SFP_F_TX_DISABLE)
+		sfp_soft_set_state(sfp, state);
 }
 
 static unsigned int sfp_check(void *buf, size_t len)
@@ -1407,11 +1476,6 @@ static void sfp_sm_fault(struct sfp *sfp, unsigned int next_state, bool warn)
 	}
 }
 
-static void sfp_sm_mod_init(struct sfp *sfp)
-{
-	sfp_module_tx_enable(sfp);
-}
-
 static void sfp_sm_probe_for_phy(struct sfp *sfp)
 {
 	/* Setting the serdes link mode is guesswork: there's no
@@ -1574,7 +1638,7 @@ static int sfp_sm_mod_probe(struct sfp *sfp, bool report)
 		 (int)sizeof(id.ext.datecode), id.ext.datecode);
 
 	/* Check whether we support this module */
-	if (!sfp->type->module_supported(&sfp->id)) {
+	if (!sfp->type->module_supported(&id)) {
 		dev_err(sfp->dev,
 			"module is not supported - phys id 0x%02x 0x%02x\n",
 			sfp->id.base.phys_id, sfp->id.base.phys_ext_id);
@@ -1764,6 +1828,7 @@ static void sfp_sm_main(struct sfp *sfp, unsigned int event)
 		if (sfp->mod_phy)
 			sfp_sm_phy_detach(sfp);
 		sfp_module_tx_disable(sfp);
+		sfp_soft_stop_poll(sfp);
 		sfp_sm_next(sfp, SFP_S_DOWN, 0);
 		return;
 	}
@@ -1775,7 +1840,10 @@ static void sfp_sm_main(struct sfp *sfp, unsigned int event)
 		    sfp->sm_dev_state != SFP_DEV_UP)
 			break;
 
-		sfp_sm_mod_init(sfp);
+		if (!(sfp->id.ext.diagmon & SFP_DIAGMON_ADDRMODE))
+			sfp_soft_start_poll(sfp);
+
+		sfp_module_tx_enable(sfp);
 
 		/* Initialise the fault clearance retries */
 		sfp->sm_retries = 5;
@@ -2031,7 +2099,10 @@ static void sfp_poll(struct work_struct *work)
 	struct sfp *sfp = container_of(work, struct sfp, poll.work);
 
 	sfp_check_state(sfp);
-	mod_delayed_work(system_wq, &sfp->poll, poll_jiffies);
+
+	if (sfp->state_soft_mask & (SFP_F_LOS | SFP_F_TX_FAULT) ||
+	    sfp->need_poll)
+		mod_delayed_work(system_wq, &sfp->poll, poll_jiffies);
 }
 
 static struct sfp *sfp_alloc(struct device *dev)
@@ -2076,7 +2147,6 @@ static int sfp_probe(struct platform_device *pdev)
 	const struct sff_data *sff;
 	struct i2c_adapter *i2c;
 	struct sfp *sfp;
-	bool poll = false;
 	int err, i;
 
 	sfp = sfp_alloc(&pdev->dev);
@@ -2183,7 +2253,7 @@ static int sfp_probe(struct platform_device *pdev)
 
 		sfp->gpio_irq[i] = gpiod_to_irq(sfp->gpio[i]);
 		if (!sfp->gpio_irq[i]) {
-			poll = true;
+			sfp->need_poll = true;
 			continue;
 		}
 
@@ -2195,11 +2265,11 @@ static int sfp_probe(struct platform_device *pdev)
 						dev_name(sfp->dev), sfp);
 		if (err) {
 			sfp->gpio_irq[i] = 0;
-			poll = true;
+			sfp->need_poll = true;
 		}
 	}
 
-	if (poll)
+	if (sfp->need_poll)
 		mod_delayed_work(system_wq, &sfp->poll, poll_jiffies);
 
 	/* We could have an issue in cases no Tx disable pin is available or
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 3b35efd85bb1..487fd9412d10 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -428,6 +428,10 @@ enum {
 	SFP_TEC_CUR			= 0x6c,
 
 	SFP_STATUS			= 0x6e,
+	SFP_STATUS_TX_DISABLE		= BIT(7),
+	SFP_STATUS_TX_DISABLE_FORCE	= BIT(6),
+	SFP_STATUS_TX_FAULT		= BIT(2),
+	SFP_STATUS_RX_LOS		= BIT(1),
 	SFP_ALARM0			= 0x70,
 	SFP_ALARM0_TEMP_HIGH		= BIT(7),
 	SFP_ALARM0_TEMP_LOW		= BIT(6),
-- 
cgit v1.2.3-59-g8ed1b


From fca3f91cc38ad866c995fb099d961b31cd687849 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 21 Nov 2019 18:03:26 +0800
Subject: net: sched: add vxlan option support to act_tunnel_key

This patch is to allow setting vxlan options using the
act_tunnel_key action. Different from geneve options,
only one option can be set. And also, geneve options
and vxlan options can't be set at the same time.

gbp is the only param for vxlan options:

  # ip link add name vxlan0 type vxlan dstport 0 external
  # tc qdisc add dev eth0 ingress
  # tc filter add dev eth0 protocol ip parent ffff: \
           flower indev eth0 \
              ip_proto udp \
              action tunnel_key \
                  set src_ip 10.0.99.192 \
                  dst_ip 10.0.99.193 \
                  dst_port 6081 \
                  id 11 \
  		  vxlan_opts 01020304 \
          action mirred egress redirect dev vxlan0

v1->v2:
  - add .strict_start_type for enc_opts_policy as Jakub noticed.
  - use Duplicate instead of Wrong in err msg for extack as Jakub
    suggested.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_tunnel_key.h | 13 +++++
 net/sched/act_tunnel_key.c                | 85 ++++++++++++++++++++++++++++++-
 2 files changed, 97 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h
index 41c8b462c177..f302c2a76953 100644
--- a/include/uapi/linux/tc_act/tc_tunnel_key.h
+++ b/include/uapi/linux/tc_act/tc_tunnel_key.h
@@ -50,6 +50,10 @@ enum {
 						 * TCA_TUNNEL_KEY_ENC_OPTS_
 						 * attributes
 						 */
+	TCA_TUNNEL_KEY_ENC_OPTS_VXLAN,		/* Nested
+						 * TCA_TUNNEL_KEY_ENC_OPTS_
+						 * attributes
+						 */
 	__TCA_TUNNEL_KEY_ENC_OPTS_MAX,
 };
 
@@ -67,4 +71,13 @@ enum {
 #define TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX \
 	(__TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX - 1)
 
+enum {
+	TCA_TUNNEL_KEY_ENC_OPT_VXLAN_UNSPEC,
+	TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP,		/* u32 */
+	__TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX,
+};
+
+#define TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX \
+	(__TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX - 1)
+
 #endif
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index cb34e5d57aaa..ff0909b57511 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -10,6 +10,7 @@
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <net/geneve.h>
+#include <net/vxlan.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/dst.h>
@@ -53,7 +54,10 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
 
 static const struct nla_policy
 enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = {
+	[TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC]	= {
+		.strict_start_type = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN },
 	[TCA_TUNNEL_KEY_ENC_OPTS_GENEVE]	= { .type = NLA_NESTED },
+	[TCA_TUNNEL_KEY_ENC_OPTS_VXLAN]		= { .type = NLA_NESTED },
 };
 
 static const struct nla_policy
@@ -64,6 +68,11 @@ geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = {
 						       .len = 128 },
 };
 
+static const struct nla_policy
+vxlan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1] = {
+	[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]	   = { .type = NLA_U32 },
+};
+
 static int
 tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
 			   struct netlink_ext_ack *extack)
@@ -116,10 +125,36 @@ tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
 	return opt_len;
 }
 
+static int
+tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1];
+	int err;
+
+	err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, nla,
+			       vxlan_opt_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]) {
+		NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp");
+		return -EINVAL;
+	}
+
+	if (dst) {
+		struct vxlan_metadata *md = dst;
+
+		md->gbp = nla_get_u32(tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]);
+	}
+
+	return sizeof(struct vxlan_metadata);
+}
+
 static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
 				int dst_len, struct netlink_ext_ack *extack)
 {
-	int err, rem, opt_len, len = nla_len(nla), opts_len = 0;
+	int err, rem, opt_len, len = nla_len(nla), opts_len = 0, type = 0;
 	const struct nlattr *attr, *head = nla_data(nla);
 
 	err = nla_validate_deprecated(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX,
@@ -130,6 +165,10 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
 	nla_for_each_attr(attr, head, len, rem) {
 		switch (nla_type(attr)) {
 		case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
+			if (type && type != TUNNEL_GENEVE_OPT) {
+				NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
+				return -EINVAL;
+			}
 			opt_len = tunnel_key_copy_geneve_opt(attr, dst,
 							     dst_len, extack);
 			if (opt_len < 0)
@@ -139,6 +178,19 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
 				dst_len -= opt_len;
 				dst += opt_len;
 			}
+			type = TUNNEL_GENEVE_OPT;
+			break;
+		case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
+			if (type) {
+				NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options");
+				return -EINVAL;
+			}
+			opt_len = tunnel_key_copy_vxlan_opt(attr, dst,
+							    dst_len, extack);
+			if (opt_len < 0)
+				return opt_len;
+			opts_len += opt_len;
+			type = TUNNEL_VXLAN_OPT;
 			break;
 		}
 	}
@@ -174,6 +226,14 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
 					    opts_len, extack);
 #else
 		return -EAFNOSUPPORT;
+#endif
+	case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
+#if IS_ENABLED(CONFIG_INET)
+		info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+		return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+					    opts_len, extack);
+#else
+		return -EAFNOSUPPORT;
 #endif
 	default:
 		NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type");
@@ -451,6 +511,25 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb,
 	return 0;
 }
 
+static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb,
+				      const struct ip_tunnel_info *info)
+{
+	struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1);
+	struct nlattr *start;
+
+	start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN);
+	if (!start)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) {
+		nla_nest_cancel(skb, start);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, start);
+	return 0;
+}
+
 static int tunnel_key_opts_dump(struct sk_buff *skb,
 				const struct ip_tunnel_info *info)
 {
@@ -468,6 +547,10 @@ static int tunnel_key_opts_dump(struct sk_buff *skb,
 		err = tunnel_key_geneve_opts_dump(skb, info);
 		if (err)
 			goto err_out;
+	} else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+		err = tunnel_key_vxlan_opts_dump(skb, info);
+		if (err)
+			goto err_out;
 	} else {
 err_out:
 		nla_nest_cancel(skb, start);
-- 
cgit v1.2.3-59-g8ed1b


From e20d4ff2acd7db2ffce64a6ddbdaeec43a8eec19 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 21 Nov 2019 18:03:27 +0800
Subject: net: sched: add erspan option support to act_tunnel_key

This patch is to allow setting erspan options using the
act_tunnel_key action. Different from geneve options,
only one option can be set. And also, geneve options,
vxlan options or erspan options can't be set at the
same time.

Options are expressed as ver:index:dir:hwid, when ver
is set to 1, index will be applied while dir and hwid
will be ignored, and when ver is set to 2, dir and
hwid will be used while index will be ignored.

  # ip link add name erspan1 type erspan external
  # tc qdisc add dev eth0 ingress
  # tc filter add dev eth0 protocol ip parent ffff: \
           flower indev eth0 \
              ip_proto udp \
              action tunnel_key \
                  set src_ip 10.0.99.192 \
                  dst_ip 10.0.99.193 \
                  dst_port 6081 \
                  id 11 \
  		erspan_opts 1:2:0:0 \
          action mirred egress redirect dev erspan1

v1->v2:
  - do the validation when dst is not yet allocated as Jakub suggested.
  - use Duplicate instead of Wrong in err msg for extack.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_tunnel_key.h |  16 ++++
 net/sched/act_tunnel_key.c                | 118 ++++++++++++++++++++++++++++++
 2 files changed, 134 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h
index f302c2a76953..3f10dc4e7a4b 100644
--- a/include/uapi/linux/tc_act/tc_tunnel_key.h
+++ b/include/uapi/linux/tc_act/tc_tunnel_key.h
@@ -54,6 +54,10 @@ enum {
 						 * TCA_TUNNEL_KEY_ENC_OPTS_
 						 * attributes
 						 */
+	TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN,		/* Nested
+						 * TCA_TUNNEL_KEY_ENC_OPTS_
+						 * attributes
+						 */
 	__TCA_TUNNEL_KEY_ENC_OPTS_MAX,
 };
 
@@ -80,4 +84,16 @@ enum {
 #define TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX \
 	(__TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX - 1)
 
+enum {
+	TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_UNSPEC,
+	TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER,		/* u8 */
+	TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX,		/* be32 */
+	TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR,		/* u8 */
+	TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID,		/* u8 */
+	__TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX,
+};
+
+#define TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX \
+	(__TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX - 1)
+
 #endif
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index ff0909b57511..30b58256d3da 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -11,6 +11,7 @@
 #include <linux/rtnetlink.h>
 #include <net/geneve.h>
 #include <net/vxlan.h>
+#include <net/erspan.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/dst.h>
@@ -58,6 +59,7 @@ enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = {
 		.strict_start_type = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN },
 	[TCA_TUNNEL_KEY_ENC_OPTS_GENEVE]	= { .type = NLA_NESTED },
 	[TCA_TUNNEL_KEY_ENC_OPTS_VXLAN]		= { .type = NLA_NESTED },
+	[TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN]	= { .type = NLA_NESTED },
 };
 
 static const struct nla_policy
@@ -73,6 +75,14 @@ vxlan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1] = {
 	[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]	   = { .type = NLA_U32 },
 };
 
+static const struct nla_policy
+erspan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1] = {
+	[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]	   = { .type = NLA_U8 },
+	[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]	   = { .type = NLA_U32 },
+	[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR]	   = { .type = NLA_U8 },
+	[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]	   = { .type = NLA_U8 },
+};
+
 static int
 tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len,
 			   struct netlink_ext_ack *extack)
@@ -151,6 +161,59 @@ tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len,
 	return sizeof(struct vxlan_metadata);
 }
 
+static int
+tunnel_key_copy_erspan_opt(const struct nlattr *nla, void *dst, int dst_len,
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1];
+	int err;
+	u8 ver;
+
+	err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, nla,
+			       erspan_opt_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]) {
+		NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver");
+		return -EINVAL;
+	}
+
+	ver = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]);
+	if (ver == 1) {
+		if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]) {
+			NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index");
+			return -EINVAL;
+		}
+	} else if (ver == 2) {
+		if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] ||
+		    !tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]) {
+			NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid");
+			return -EINVAL;
+		}
+	} else {
+		NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect");
+		return -EINVAL;
+	}
+
+	if (dst) {
+		struct erspan_metadata *md = dst;
+
+		md->version = ver;
+		if (ver == 1) {
+			nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX];
+			md->u.index = nla_get_be32(nla);
+		} else {
+			nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR];
+			md->u.md2.dir = nla_get_u8(nla);
+			nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID];
+			set_hwid(&md->u.md2, nla_get_u8(nla));
+		}
+	}
+
+	return sizeof(struct erspan_metadata);
+}
+
 static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
 				int dst_len, struct netlink_ext_ack *extack)
 {
@@ -192,6 +255,18 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
 			opts_len += opt_len;
 			type = TUNNEL_VXLAN_OPT;
 			break;
+		case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
+			if (type) {
+				NL_SET_ERR_MSG(extack, "Duplicate type for erspan options");
+				return -EINVAL;
+			}
+			opt_len = tunnel_key_copy_erspan_opt(attr, dst,
+							     dst_len, extack);
+			if (opt_len < 0)
+				return opt_len;
+			opts_len += opt_len;
+			type = TUNNEL_ERSPAN_OPT;
+			break;
 		}
 	}
 
@@ -234,6 +309,14 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
 					    opts_len, extack);
 #else
 		return -EAFNOSUPPORT;
+#endif
+	case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
+#if IS_ENABLED(CONFIG_INET)
+		info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+		return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
+					    opts_len, extack);
+#else
+		return -EAFNOSUPPORT;
 #endif
 	default:
 		NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type");
@@ -530,6 +613,37 @@ static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb,
 	return 0;
 }
 
+static int tunnel_key_erspan_opts_dump(struct sk_buff *skb,
+				       const struct ip_tunnel_info *info)
+{
+	struct erspan_metadata *md = (struct erspan_metadata *)(info + 1);
+	struct nlattr *start;
+
+	start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN);
+	if (!start)
+		return -EMSGSIZE;
+
+	if (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, md->version))
+		goto err;
+
+	if (md->version == 1 &&
+	    nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index))
+		goto err;
+
+	if (md->version == 2 &&
+	    (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR,
+			md->u.md2.dir) ||
+	     nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID,
+			get_hwid(&md->u.md2))))
+		goto err;
+
+	nla_nest_end(skb, start);
+	return 0;
+err:
+	nla_nest_cancel(skb, start);
+	return -EMSGSIZE;
+}
+
 static int tunnel_key_opts_dump(struct sk_buff *skb,
 				const struct ip_tunnel_info *info)
 {
@@ -551,6 +665,10 @@ static int tunnel_key_opts_dump(struct sk_buff *skb,
 		err = tunnel_key_vxlan_opts_dump(skb, info);
 		if (err)
 			goto err_out;
+	} else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) {
+		err = tunnel_key_erspan_opts_dump(skb, info);
+		if (err)
+			goto err_out;
 	} else {
 err_out:
 		nla_nest_cancel(skb, start);
-- 
cgit v1.2.3-59-g8ed1b


From d8f9dfae49ce4ffb772dc10dd6578dc815b34c12 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 21 Nov 2019 18:03:28 +0800
Subject: net: sched: allow flower to match vxlan options

This patch is to allow matching gbp option in vxlan.

The options can be described in the form GBP/GBP_MASK,
where GBP is represented as a 32bit hexadecimal value.
Different from geneve, only one option can be set. And
also, geneve options and vxlan options can't be set at
the same time.

  # ip link add name vxlan0 type vxlan dstport 0 external
  # tc qdisc add dev vxlan0 ingress
  # tc filter add dev vxlan0 protocol ip parent ffff: \
      flower \
        enc_src_ip 10.0.99.192 \
        enc_dst_ip 10.0.99.193 \
        enc_key_id 11 \
        vxlan_opts 01020304/ffffffff \
        ip_proto udp \
        action mirred egress redirect dev eth0

v1->v2:
  - add .strict_start_type for enc_opts_policy as Jakub noticed.
  - use Duplicate instead of Wrong in err msg for extack as Jakub
    suggested.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |  13 ++++++
 net/sched/cls_flower.c       | 109 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index c6ad22f76ede..929825d710e2 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -571,6 +571,10 @@ enum {
 					 * TCA_FLOWER_KEY_ENC_OPT_GENEVE_
 					 * attributes
 					 */
+	TCA_FLOWER_KEY_ENC_OPTS_VXLAN,	/* Nested
+					 * TCA_FLOWER_KEY_ENC_OPT_VXLAN_
+					 * attributes
+					 */
 	__TCA_FLOWER_KEY_ENC_OPTS_MAX,
 };
 
@@ -588,6 +592,15 @@ enum {
 #define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \
 		(__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1)
 
+enum {
+	TCA_FLOWER_KEY_ENC_OPT_VXLAN_UNSPEC,
+	TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP,		/* u32 */
+	__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX,
+};
+
+#define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \
+		(__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1)
+
 enum {
 	TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0),
 	TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 74221e3351c3..abc73801df65 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -22,6 +22,7 @@
 #include <net/ip.h>
 #include <net/flow_dissector.h>
 #include <net/geneve.h>
+#include <net/vxlan.h>
 
 #include <net/dst.h>
 #include <net/dst_metadata.h>
@@ -688,7 +689,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
 
 static const struct nla_policy
 enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = {
+	[TCA_FLOWER_KEY_ENC_OPTS_UNSPEC]        = {
+		.strict_start_type = TCA_FLOWER_KEY_ENC_OPTS_VXLAN },
 	[TCA_FLOWER_KEY_ENC_OPTS_GENEVE]        = { .type = NLA_NESTED },
+	[TCA_FLOWER_KEY_ENC_OPTS_VXLAN]         = { .type = NLA_NESTED },
 };
 
 static const struct nla_policy
@@ -699,6 +703,11 @@ geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = {
 						       .len = 128 },
 };
 
+static const struct nla_policy
+vxlan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1] = {
+	[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]         = { .type = NLA_U32 },
+};
+
 static void fl_set_key_val(struct nlattr **tb,
 			   void *val, int val_type,
 			   void *mask, int mask_type, int len)
@@ -928,6 +937,41 @@ static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key,
 	return sizeof(struct geneve_opt) + data_len;
 }
 
+static int fl_set_vxlan_opt(const struct nlattr *nla, struct fl_flow_key *key,
+			    int depth, int option_len,
+			    struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1];
+	struct vxlan_metadata *md;
+	int err;
+
+	md = (struct vxlan_metadata *)&key->enc_opts.data[key->enc_opts.len];
+	memset(md, 0xff, sizeof(*md));
+
+	if (!depth)
+		return sizeof(*md);
+
+	if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_VXLAN) {
+		NL_SET_ERR_MSG(extack, "Non-vxlan option type for mask");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, nla,
+			       vxlan_opt_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]) {
+		NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp");
+		return -EINVAL;
+	}
+
+	if (tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP])
+		md->gbp = nla_get_u32(tb[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]);
+
+	return sizeof(*md);
+}
+
 static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 			  struct fl_flow_key *mask,
 			  struct netlink_ext_ack *extack)
@@ -958,6 +1002,11 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 			  nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) {
 		switch (nla_type(nla_opt_key)) {
 		case TCA_FLOWER_KEY_ENC_OPTS_GENEVE:
+			if (key->enc_opts.dst_opt_type &&
+			    key->enc_opts.dst_opt_type != TUNNEL_GENEVE_OPT) {
+				NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
+				return -EINVAL;
+			}
 			option_len = 0;
 			key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
 			option_len = fl_set_geneve_opt(nla_opt_key, key,
@@ -983,6 +1032,39 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 				return -EINVAL;
 			}
 
+			if (msk_depth)
+				nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+			break;
+		case TCA_FLOWER_KEY_ENC_OPTS_VXLAN:
+			if (key->enc_opts.dst_opt_type) {
+				NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options");
+				return -EINVAL;
+			}
+			option_len = 0;
+			key->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT;
+			option_len = fl_set_vxlan_opt(nla_opt_key, key,
+						      key_depth, option_len,
+						      extack);
+			if (option_len < 0)
+				return option_len;
+
+			key->enc_opts.len += option_len;
+			/* At the same time we need to parse through the mask
+			 * in order to verify exact and mask attribute lengths.
+			 */
+			mask->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT;
+			option_len = fl_set_vxlan_opt(nla_opt_msk, mask,
+						      msk_depth, option_len,
+						      extack);
+			if (option_len < 0)
+				return option_len;
+
+			mask->enc_opts.len += option_len;
+			if (key->enc_opts.len != mask->enc_opts.len) {
+				NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+				return -EINVAL;
+			}
+
 			if (msk_depth)
 				nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
 			break;
@@ -2135,6 +2217,28 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static int fl_dump_key_vxlan_opt(struct sk_buff *skb,
+				 struct flow_dissector_key_enc_opts *enc_opts)
+{
+	struct vxlan_metadata *md;
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_VXLAN);
+	if (!nest)
+		goto nla_put_failure;
+
+	md = (struct vxlan_metadata *)&enc_opts->data[0];
+	if (nla_put_u32(skb, TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, md->gbp))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
 static int fl_dump_key_ct(struct sk_buff *skb,
 			  struct flow_dissector_key_ct *key,
 			  struct flow_dissector_key_ct *mask)
@@ -2188,6 +2292,11 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
 		if (err)
 			goto nla_put_failure;
 		break;
+	case TUNNEL_VXLAN_OPT:
+		err = fl_dump_key_vxlan_opt(skb, enc_opts);
+		if (err)
+			goto nla_put_failure;
+		break;
 	default:
 		goto nla_put_failure;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 79b1011cb33d166f531a1347a17e6602954e4eb1 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 21 Nov 2019 18:03:29 +0800
Subject: net: sched: allow flower to match erspan options

This patch is to allow matching options in erspan.

The options can be described in the form:
VER:INDEX:DIR:HWID/VER:INDEX_MASK:DIR_MASK:HWID_MASK.
When ver is set to 1, index will be applied while dir
and hwid will be ignored, and when ver is set to 2,
dir and hwid will be used while index will be ignored.

Different from geneve, only one option can be set. And
also, geneve options, vxlan options or erspan options
can't be set at the same time.

  # ip link add name erspan1 type erspan external
  # tc qdisc add dev erspan1 ingress
  # tc filter add dev erspan1 protocol ip parent ffff: \
      flower \
        enc_src_ip 10.0.99.192 \
        enc_dst_ip 10.0.99.193 \
        enc_key_id 11 \
        erspan_opts 1:12:0:0/1:ffff:0:0 \
        ip_proto udp \
        action mirred egress redirect dev eth0

v1->v2:
  - improve some err msgs of extack.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h |  16 +++++
 net/sched/cls_flower.c       | 145 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 161 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 929825d710e2..449a63971451 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -575,6 +575,10 @@ enum {
 					 * TCA_FLOWER_KEY_ENC_OPT_VXLAN_
 					 * attributes
 					 */
+	TCA_FLOWER_KEY_ENC_OPTS_ERSPAN,	/* Nested
+					 * TCA_FLOWER_KEY_ENC_OPT_ERSPAN_
+					 * attributes
+					 */
 	__TCA_FLOWER_KEY_ENC_OPTS_MAX,
 };
 
@@ -601,6 +605,18 @@ enum {
 #define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \
 		(__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1)
 
+enum {
+	TCA_FLOWER_KEY_ENC_OPT_ERSPAN_UNSPEC,
+	TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER,              /* u8 */
+	TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX,            /* be32 */
+	TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR,              /* u8 */
+	TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID,             /* u8 */
+	__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX,
+};
+
+#define TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX \
+		(__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX - 1)
+
 enum {
 	TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0),
 	TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index abc73801df65..c307ee1d6ca6 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -23,6 +23,7 @@
 #include <net/flow_dissector.h>
 #include <net/geneve.h>
 #include <net/vxlan.h>
+#include <net/erspan.h>
 
 #include <net/dst.h>
 #include <net/dst_metadata.h>
@@ -693,6 +694,7 @@ enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = {
 		.strict_start_type = TCA_FLOWER_KEY_ENC_OPTS_VXLAN },
 	[TCA_FLOWER_KEY_ENC_OPTS_GENEVE]        = { .type = NLA_NESTED },
 	[TCA_FLOWER_KEY_ENC_OPTS_VXLAN]         = { .type = NLA_NESTED },
+	[TCA_FLOWER_KEY_ENC_OPTS_ERSPAN]        = { .type = NLA_NESTED },
 };
 
 static const struct nla_policy
@@ -708,6 +710,14 @@ vxlan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX + 1] = {
 	[TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP]         = { .type = NLA_U32 },
 };
 
+static const struct nla_policy
+erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = {
+	[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]        = { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]      = { .type = NLA_U32 },
+	[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]        = { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]       = { .type = NLA_U8 },
+};
+
 static void fl_set_key_val(struct nlattr **tb,
 			   void *val, int val_type,
 			   void *mask, int mask_type, int len)
@@ -972,6 +982,70 @@ static int fl_set_vxlan_opt(const struct nlattr *nla, struct fl_flow_key *key,
 	return sizeof(*md);
 }
 
+static int fl_set_erspan_opt(const struct nlattr *nla, struct fl_flow_key *key,
+			     int depth, int option_len,
+			     struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1];
+	struct erspan_metadata *md;
+	int err;
+
+	md = (struct erspan_metadata *)&key->enc_opts.data[key->enc_opts.len];
+	memset(md, 0xff, sizeof(*md));
+	md->version = 1;
+
+	if (!depth)
+		return sizeof(*md);
+
+	if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_ERSPAN) {
+		NL_SET_ERR_MSG(extack, "Non-erspan option type for mask");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, nla,
+			       erspan_opt_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]) {
+		NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver");
+		return -EINVAL;
+	}
+
+	if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER])
+		md->version = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER]);
+
+	if (md->version == 1) {
+		if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) {
+			NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index");
+			return -EINVAL;
+		}
+		if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX]) {
+			nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX];
+			md->u.index = nla_get_be32(nla);
+		}
+	} else if (md->version == 2) {
+		if (!option_len && (!tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR] ||
+				    !tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID])) {
+			NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid");
+			return -EINVAL;
+		}
+		if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR]) {
+			nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR];
+			md->u.md2.dir = nla_get_u8(nla);
+		}
+		if (tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID]) {
+			nla = tb[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID];
+			set_hwid(&md->u.md2, nla_get_u8(nla));
+		}
+	} else {
+		NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect");
+		return -EINVAL;
+	}
+
+	return sizeof(*md);
+}
+
 static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 			  struct fl_flow_key *mask,
 			  struct netlink_ext_ack *extack)
@@ -1065,6 +1139,39 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 				return -EINVAL;
 			}
 
+			if (msk_depth)
+				nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
+			break;
+		case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN:
+			if (key->enc_opts.dst_opt_type) {
+				NL_SET_ERR_MSG(extack, "Duplicate type for erspan options");
+				return -EINVAL;
+			}
+			option_len = 0;
+			key->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT;
+			option_len = fl_set_erspan_opt(nla_opt_key, key,
+						       key_depth, option_len,
+						       extack);
+			if (option_len < 0)
+				return option_len;
+
+			key->enc_opts.len += option_len;
+			/* At the same time we need to parse through the mask
+			 * in order to verify exact and mask attribute lengths.
+			 */
+			mask->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT;
+			option_len = fl_set_erspan_opt(nla_opt_msk, mask,
+						       msk_depth, option_len,
+						       extack);
+			if (option_len < 0)
+				return option_len;
+
+			mask->enc_opts.len += option_len;
+			if (key->enc_opts.len != mask->enc_opts.len) {
+				NL_SET_ERR_MSG(extack, "Key and mask miss aligned");
+				return -EINVAL;
+			}
+
 			if (msk_depth)
 				nla_opt_msk = nla_next(nla_opt_msk, &msk_depth);
 			break;
@@ -2239,6 +2346,39 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static int fl_dump_key_erspan_opt(struct sk_buff *skb,
+				  struct flow_dissector_key_enc_opts *enc_opts)
+{
+	struct erspan_metadata *md;
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_ERSPAN);
+	if (!nest)
+		goto nla_put_failure;
+
+	md = (struct erspan_metadata *)&enc_opts->data[0];
+	if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, md->version))
+		goto nla_put_failure;
+
+	if (md->version == 1 &&
+	    nla_put_be32(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index))
+		goto nla_put_failure;
+
+	if (md->version == 2 &&
+	    (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR,
+			md->u.md2.dir) ||
+	     nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID,
+			get_hwid(&md->u.md2))))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
 static int fl_dump_key_ct(struct sk_buff *skb,
 			  struct flow_dissector_key_ct *key,
 			  struct flow_dissector_key_ct *mask)
@@ -2297,6 +2437,11 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
 		if (err)
 			goto nla_put_failure;
 		break;
+	case TUNNEL_ERSPAN_OPT:
+		err = fl_dump_key_erspan_opt(skb, enc_opts);
+		if (err)
+			goto nla_put_failure;
+		break;
 	default:
 		goto nla_put_failure;
 	}
-- 
cgit v1.2.3-59-g8ed1b


From 7599a896f2e46e9c072e02a8299a67d4d2f96675 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Thu, 21 Nov 2019 16:58:53 +0100
Subject: audit: Move audit_log_task declaration under CONFIG_AUDITSYSCALL

The 0-DAY found that audit_log_task is not declared under
CONFIG_AUDITSYSCALL which causes compilation error when
it is not defined:

    kernel/bpf/syscall.o: In function `bpf_audit_prog.isra.30':
 >> syscall.c:(.text+0x860): undefined reference to `audit_log_task'

Adding the audit_log_task declaration and stub within
CONFIG_AUDITSYSCALL ifdef.

Fixes: 91e6015b082b ("bpf: Emit audit messages upon successful prog load and unload")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/audit.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index edd006f4597d..18925d924c73 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -159,7 +159,6 @@ extern void		    audit_log_key(struct audit_buffer *ab,
 extern void		    audit_log_link_denied(const char *operation);
 extern void		    audit_log_lost(const char *message);
 
-extern void audit_log_task(struct audit_buffer *ab);
 extern int audit_log_task_context(struct audit_buffer *ab);
 extern void audit_log_task_info(struct audit_buffer *ab);
 
@@ -220,8 +219,6 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key)
 { }
 static inline void audit_log_link_denied(const char *string)
 { }
-static inline void audit_log_task(struct audit_buffer *ab)
-{ }
 static inline int audit_log_task_context(struct audit_buffer *ab)
 {
 	return 0;
@@ -361,6 +358,8 @@ static inline void audit_ptrace(struct task_struct *t)
 		__audit_ptrace(t);
 }
 
+extern void audit_log_task(struct audit_buffer *ab);
+
 				/* Private API (for audit.c only) */
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
@@ -648,6 +647,9 @@ static inline void audit_ntp_log(const struct audit_ntp_data *ad)
 
 static inline void audit_ptrace(struct task_struct *t)
 { }
+
+static inline void audit_log_task(struct audit_buffer *ab)
+{ }
 #define audit_n_rules 0
 #define audit_signals 0
 #endif /* CONFIG_AUDITSYSCALL */
-- 
cgit v1.2.3-59-g8ed1b


From f145922ddcaa1cb9688b3d053622c98d9f9a7fff Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 20 Nov 2019 16:23:14 +0800
Subject: net: mscc: ocelot: export ocelot_hwstamp_get/set functions

Export ocelot_hwstamp_get/set functions so that DSA driver
is able to reuse them.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.c | 8 ++++----
 include/soc/mscc/ocelot.h          | 2 ++
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 90c46ba763d7..7302724a9261 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -1049,15 +1049,14 @@ static int ocelot_get_port_parent_id(struct net_device *dev,
 	return 0;
 }
 
-static int ocelot_hwstamp_get(struct ocelot *ocelot, int port,
-			      struct ifreq *ifr)
+int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr)
 {
 	return copy_to_user(ifr->ifr_data, &ocelot->hwtstamp_config,
 			    sizeof(ocelot->hwtstamp_config)) ? -EFAULT : 0;
 }
+EXPORT_SYMBOL(ocelot_hwstamp_get);
 
-static int ocelot_hwstamp_set(struct ocelot *ocelot, int port,
-			      struct ifreq *ifr)
+int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	struct hwtstamp_config cfg;
@@ -1120,6 +1119,7 @@ static int ocelot_hwstamp_set(struct ocelot *ocelot, int port,
 
 	return copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)) ? -EFAULT : 0;
 }
+EXPORT_SYMBOL(ocelot_hwstamp_set);
 
 static int ocelot_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index a836afe8f68e..2bac4bc34cf6 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -533,6 +533,8 @@ int ocelot_fdb_del(struct ocelot *ocelot, int port,
 int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid,
 		    bool untagged);
 int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid);
+int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr);
+int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr);
 int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts);
 void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts);
 
-- 
cgit v1.2.3-59-g8ed1b


From e23a7b3e8daa4be3d91544d8ba210f96d2266de9 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 20 Nov 2019 16:23:15 +0800
Subject: net: mscc: ocelot: convert to use ocelot_get_txtstamp()

The method getting TX timestamp by reading timestamp FIFO and
matching skbs list is common for DSA Felix driver too.
So move code out of ocelot_board.c, convert to use
ocelot_get_txtstamp() function and export it.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.c       | 62 ++++++++++++++++++++++++++++++--
 drivers/net/ethernet/mscc/ocelot.h       |  6 ----
 drivers/net/ethernet/mscc/ocelot_board.c | 53 +--------------------------
 include/soc/mscc/ocelot.h                |  9 ++++-
 4 files changed, 69 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 7302724a9261..a58d2ed5b590 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -661,7 +661,8 @@ out:
 	return NETDEV_TX_OK;
 }
 
-void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts)
+static void ocelot_get_hwtimestamp(struct ocelot *ocelot,
+				   struct timespec64 *ts)
 {
 	unsigned long flags;
 	u32 val;
@@ -686,7 +687,64 @@ void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts)
 
 	spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags);
 }
-EXPORT_SYMBOL(ocelot_get_hwtimestamp);
+
+void ocelot_get_txtstamp(struct ocelot *ocelot)
+{
+	int budget = OCELOT_PTP_QUEUE_SZ;
+
+	while (budget--) {
+		struct skb_shared_hwtstamps shhwtstamps;
+		struct list_head *pos, *tmp;
+		struct sk_buff *skb = NULL;
+		struct ocelot_skb *entry;
+		struct ocelot_port *port;
+		struct timespec64 ts;
+		u32 val, id, txport;
+
+		val = ocelot_read(ocelot, SYS_PTP_STATUS);
+
+		/* Check if a timestamp can be retrieved */
+		if (!(val & SYS_PTP_STATUS_PTP_MESS_VLD))
+			break;
+
+		WARN_ON(val & SYS_PTP_STATUS_PTP_OVFL);
+
+		/* Retrieve the ts ID and Tx port */
+		id = SYS_PTP_STATUS_PTP_MESS_ID_X(val);
+		txport = SYS_PTP_STATUS_PTP_MESS_TXPORT_X(val);
+
+		/* Retrieve its associated skb */
+		port = ocelot->ports[txport];
+
+		list_for_each_safe(pos, tmp, &port->skbs) {
+			entry = list_entry(pos, struct ocelot_skb, head);
+			if (entry->id != id)
+				continue;
+
+			skb = entry->skb;
+
+			list_del(pos);
+			kfree(entry);
+		}
+
+		/* Next ts */
+		ocelot_write(ocelot, SYS_PTP_NXT_PTP_NXT, SYS_PTP_NXT);
+
+		if (unlikely(!skb))
+			continue;
+
+		/* Get the h/w timestamp */
+		ocelot_get_hwtimestamp(ocelot, &ts);
+
+		/* Set the timestamp into the skb */
+		memset(&shhwtstamps, 0, sizeof(shhwtstamps));
+		shhwtstamps.hwtstamp = ktime_set(ts.tv_sec, ts.tv_nsec);
+		skb_tstamp_tx(skb, &shhwtstamps);
+
+		dev_kfree_skb_any(skb);
+	}
+}
+EXPORT_SYMBOL(ocelot_get_txtstamp);
 
 static int ocelot_mc_unsync(struct net_device *dev, const unsigned char *addr)
 {
diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index 32fef4f495aa..c259114c48fd 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -74,12 +74,6 @@ struct ocelot_port_private {
 	struct ocelot_port_tc tc;
 };
 
-struct ocelot_skb {
-	struct list_head head;
-	struct sk_buff *skb;
-	u8 id;
-};
-
 u32 ocelot_port_readl(struct ocelot_port *port, u32 reg);
 void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg);
 
diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c
index 5541ec26f953..2da8eee27e98 100644
--- a/drivers/net/ethernet/mscc/ocelot_board.c
+++ b/drivers/net/ethernet/mscc/ocelot_board.c
@@ -190,60 +190,9 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg)
 
 static irqreturn_t ocelot_ptp_rdy_irq_handler(int irq, void *arg)
 {
-	int budget = OCELOT_PTP_QUEUE_SZ;
 	struct ocelot *ocelot = arg;
 
-	while (budget--) {
-		struct skb_shared_hwtstamps shhwtstamps;
-		struct list_head *pos, *tmp;
-		struct sk_buff *skb = NULL;
-		struct ocelot_skb *entry;
-		struct ocelot_port *port;
-		struct timespec64 ts;
-		u32 val, id, txport;
-
-		val = ocelot_read(ocelot, SYS_PTP_STATUS);
-
-		/* Check if a timestamp can be retrieved */
-		if (!(val & SYS_PTP_STATUS_PTP_MESS_VLD))
-			break;
-
-		WARN_ON(val & SYS_PTP_STATUS_PTP_OVFL);
-
-		/* Retrieve the ts ID and Tx port */
-		id = SYS_PTP_STATUS_PTP_MESS_ID_X(val);
-		txport = SYS_PTP_STATUS_PTP_MESS_TXPORT_X(val);
-
-		/* Retrieve its associated skb */
-		port = ocelot->ports[txport];
-
-		list_for_each_safe(pos, tmp, &port->skbs) {
-			entry = list_entry(pos, struct ocelot_skb, head);
-			if (entry->id != id)
-				continue;
-
-			skb = entry->skb;
-
-			list_del(pos);
-			kfree(entry);
-		}
-
-		/* Next ts */
-		ocelot_write(ocelot, SYS_PTP_NXT_PTP_NXT, SYS_PTP_NXT);
-
-		if (unlikely(!skb))
-			continue;
-
-		/* Get the h/w timestamp */
-		ocelot_get_hwtimestamp(ocelot, &ts);
-
-		/* Set the timestamp into the skb */
-		memset(&shhwtstamps, 0, sizeof(shhwtstamps));
-		shhwtstamps.hwtstamp = ktime_set(ts.tv_sec, ts.tv_nsec);
-		skb_tstamp_tx(skb, &shhwtstamps);
-
-		dev_kfree_skb_any(skb);
-	}
+	ocelot_get_txtstamp(ocelot);
 
 	return IRQ_HANDLED;
 }
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 2bac4bc34cf6..1a5cb1b2ac5d 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -406,6 +406,13 @@ struct ocelot_ops {
 	int (*reset)(struct ocelot *ocelot);
 };
 
+struct ocelot_skb {
+	struct list_head head;
+	struct sk_buff *skb;
+	u8 id;
+};
+
+
 struct ocelot_port {
 	struct ocelot			*ocelot;
 
@@ -536,6 +543,6 @@ int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid);
 int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr);
 int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr);
 int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts);
-void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts);
+void ocelot_get_txtstamp(struct ocelot *ocelot);
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 400928bf928be153cddd76d9ac4e39978cb43fd3 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 20 Nov 2019 16:23:16 +0800
Subject: net: mscc: ocelot: convert to use ocelot_port_add_txtstamp_skb()

Convert to use ocelot_port_add_txtstamp_skb() for adding skbs which
require TX timestamp into list. Export it so that DSA Felix driver
could reuse it too.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.c | 43 ++++++++++++++++++++++++--------------
 include/soc/mscc/ocelot.h          |  2 ++
 2 files changed, 29 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index a58d2ed5b590..0e96ffab3b05 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -575,6 +575,32 @@ static int ocelot_gen_ifh(u32 *ifh, struct frame_info *info)
 	return 0;
 }
 
+int ocelot_port_add_txtstamp_skb(struct ocelot_port *ocelot_port,
+				 struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+	struct ocelot *ocelot = ocelot_port->ocelot;
+
+	if (ocelot->ptp && shinfo->tx_flags & SKBTX_HW_TSTAMP &&
+	    ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) {
+		struct ocelot_skb *oskb =
+			kzalloc(sizeof(struct ocelot_skb), GFP_ATOMIC);
+
+		if (unlikely(!oskb))
+			return -ENOMEM;
+
+		shinfo->tx_flags |= SKBTX_IN_PROGRESS;
+
+		oskb->skb = skb;
+		oskb->id = ocelot_port->ts_id % 4;
+
+		list_add_tail(&oskb->head, &ocelot_port->skbs);
+		return 0;
+	}
+	return -ENODATA;
+}
+EXPORT_SYMBOL(ocelot_port_add_txtstamp_skb);
+
 static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ocelot_port_private *priv = netdev_priv(dev);
@@ -637,26 +663,11 @@ static int ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev)
 	dev->stats.tx_packets++;
 	dev->stats.tx_bytes += skb->len;
 
-	if (ocelot->ptp && shinfo->tx_flags & SKBTX_HW_TSTAMP &&
-	    ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) {
-		struct ocelot_skb *oskb =
-			kzalloc(sizeof(struct ocelot_skb), GFP_ATOMIC);
-
-		if (unlikely(!oskb))
-			goto out;
-
-		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
-
-		oskb->skb = skb;
-		oskb->id = ocelot_port->ts_id % 4;
+	if (!ocelot_port_add_txtstamp_skb(ocelot_port, skb)) {
 		ocelot_port->ts_id++;
-
-		list_add_tail(&oskb->head, &ocelot_port->skbs);
-
 		return NETDEV_TX_OK;
 	}
 
-out:
 	dev_kfree_skb_any(skb);
 	return NETDEV_TX_OK;
 }
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 1a5cb1b2ac5d..e1108a5f4f17 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -543,6 +543,8 @@ int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid);
 int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr);
 int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr);
 int ocelot_ptp_gettime64(struct ptp_clock_info *ptp, struct timespec64 *ts);
+int ocelot_port_add_txtstamp_skb(struct ocelot_port *ocelot_port,
+				 struct sk_buff *skb);
 void ocelot_get_txtstamp(struct ocelot *ocelot);
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 1f8ac5703037fdd2e6c960cd35c2b14d18ef3933 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 20 Nov 2019 13:47:33 +0100
Subject: ipv6: add fib6_has_custom_rules() helper

It wraps the namespace field with the same name, to easily
access it regardless of build options.

Suggested-by: David Ahern <dsahern@gmail.com>
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 5d1615463138..8ac3a59e5126 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -502,6 +502,11 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric)
 }
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
+static inline bool fib6_has_custom_rules(const struct net *net)
+{
+	return net->ipv6.fib6_has_custom_rules;
+}
+
 int fib6_rules_init(void);
 void fib6_rules_cleanup(void);
 bool fib6_rule_default(const struct fib_rule *rule);
@@ -527,6 +532,10 @@ static inline bool fib6_rules_early_flow_dissect(struct net *net,
 	return true;
 }
 #else
+static inline bool fib6_has_custom_rules(const struct net *net)
+{
+	return false;
+}
 static inline int               fib6_rules_init(void)
 {
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From b9b33e7c24af1cddc7697056f1664279a40d9a4a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 20 Nov 2019 13:47:34 +0100
Subject: ipv6: keep track of routes using src

Use a per namespace counter, increment it on successful creation
of any route using the source address, decrement it on deletion
of such routes.

This allows us to check easily if the routing decision in the
current namespace depends on the packet source. Will be used
by the next patch.

Suggested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h    | 30 ++++++++++++++++++++++++++++++
 include/net/netns/ipv6.h |  3 +++
 net/ipv6/ip6_fib.c       |  4 ++++
 net/ipv6/route.c         |  3 +++
 4 files changed, 40 insertions(+)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 8ac3a59e5126..f1535f172935 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -90,7 +90,32 @@ struct fib6_gc_args {
 
 #ifndef CONFIG_IPV6_SUBTREES
 #define FIB6_SUBTREE(fn)	NULL
+
+static inline bool fib6_routes_require_src(const struct net *net)
+{
+	return false;
+}
+
+static inline void fib6_routes_require_src_inc(struct net *net) {}
+static inline void fib6_routes_require_src_dec(struct net *net) {}
+
 #else
+
+static inline bool fib6_routes_require_src(const struct net *net)
+{
+	return net->ipv6.fib6_routes_require_src > 0;
+}
+
+static inline void fib6_routes_require_src_inc(struct net *net)
+{
+	net->ipv6.fib6_routes_require_src++;
+}
+
+static inline void fib6_routes_require_src_dec(struct net *net)
+{
+	net->ipv6.fib6_routes_require_src--;
+}
+
 #define FIB6_SUBTREE(fn)	(rcu_dereference_protected((fn)->subtree, 1))
 #endif
 
@@ -212,6 +237,11 @@ static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
 	return ((struct rt6_info *)dst)->rt6i_idev;
 }
 
+static inline bool fib6_requires_src(const struct fib6_info *rt)
+{
+	return rt->fib6_src.plen > 0;
+}
+
 static inline void fib6_clean_expires(struct fib6_info *f6i)
 {
 	f6i->fib6_flags &= ~RTF_EXPIRES;
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 022a0fd1a5a4..5ec054473d81 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -83,6 +83,9 @@ struct netns_ipv6 {
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 	unsigned int		fib6_rules_require_fldissect;
 	bool			fib6_has_custom_rules;
+#ifdef CONFIG_IPV6_SUBTREES
+	unsigned int		fib6_routes_require_src;
+#endif
 	struct rt6_info         *ip6_prohibit_entry;
 	struct rt6_info         *ip6_blk_hole_entry;
 	struct fib6_table       *fib6_local_tbl;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f66bc2af4e9d..7bae6a91b487 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1461,6 +1461,8 @@ out:
 		}
 #endif
 		goto failure;
+	} else if (fib6_requires_src(rt)) {
+		fib6_routes_require_src_inc(info->nl_net);
 	}
 	return err;
 
@@ -1933,6 +1935,8 @@ int fib6_del(struct fib6_info *rt, struct nl_info *info)
 		struct fib6_info *cur = rcu_dereference_protected(*rtp,
 					lockdep_is_held(&table->tb6_lock));
 		if (rt == cur) {
+			if (fib6_requires_src(cur))
+				fib6_routes_require_src_dec(info->nl_net);
 			fib6_del_route(table, fn, rtp, info);
 			return 0;
 		}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index edcb52543518..c92b367e058d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -6199,6 +6199,9 @@ static int __net_init ip6_route_net_init(struct net *net)
 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
 			 ip6_template_metrics, true);
 	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
+#ifdef CONFIG_IPV6_SUBTREES
+	net->ipv6.fib6_routes_require_src = 0;
+#endif
 #endif
 
 	net->ipv6.sysctl.flush_delay = 0;
-- 
cgit v1.2.3-59-g8ed1b


From c43c3d76c021d8d654ff5cfaad381f14f6beaf1a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 20 Nov 2019 13:47:36 +0100
Subject: ipv4: move fib4_has_custom_rules() helper to public header

So that we can use it in the next patch.
Additionally constify the helper argument.

Suggested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h    | 10 ++++++++++
 net/ipv4/fib_frontend.c | 10 ----------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 52b2406a5dfc..b9cba41c6d4f 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -311,6 +311,11 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
 	return err;
 }
 
+static inline bool fib4_has_custom_rules(const struct net *net)
+{
+	return false;
+}
+
 static inline bool fib4_rule_default(const struct fib_rule *rule)
 {
 	return true;
@@ -378,6 +383,11 @@ out:
 	return err;
 }
 
+static inline bool fib4_has_custom_rules(const struct net *net)
+{
+	return net->ipv4.fib_has_custom_rules;
+}
+
 bool fib4_rule_default(const struct fib_rule *rule);
 int fib4_rules_dump(struct net *net, struct notifier_block *nb,
 		    struct netlink_ext_ack *extack);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 71c78d223dfd..577db1d50a24 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -70,11 +70,6 @@ fail:
 	fib_free_table(main_table);
 	return -ENOMEM;
 }
-
-static bool fib4_has_custom_rules(struct net *net)
-{
-	return false;
-}
 #else
 
 struct fib_table *fib_new_table(struct net *net, u32 id)
@@ -131,11 +126,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
 	}
 	return NULL;
 }
-
-static bool fib4_has_custom_rules(struct net *net)
-{
-	return net->ipv4.fib_has_custom_rules;
-}
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
 static void fib_replace_table(struct net *net, struct fib_table *old,
-- 
cgit v1.2.3-59-g8ed1b


From 02b24941619fcce3d280311ac73b1e461552e9c8 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 20 Nov 2019 13:47:37 +0100
Subject: ipv4: use dst hint for ipv4 list receive

This is alike the previous change, with some additional ipv4 specific
quirk. Even when using the route hint we still have to do perform
additional per packet checks about source address validity: a new
helper is added to wrap them.

Hints are explicitly disabled if the destination is a local broadcast,
that keeps the code simple and local broadcast are a slower path anyway.

UDP flood performances vs recvmmsg() receiver:

vanilla		patched		delta
Kpps		Kpps		%
1683		1871		+11

In the worst case scenario - each packet has a different
destination address - the performance delta is within noise
range.

v3 -> v4:
 - re-enable hints for forward

v2 -> v3:
 - really fix build (sic) and hint usage check
 - use fib4_has_custom_rules() helpers (David A.)
 - add ip_extract_route_hint() helper (Edward C.)
 - use prev skb as hint instead of copying data (Willem)

v1 -> v2:
 - fix build issue with !CONFIG_IP_MULTIPLE_TABLES

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h |  4 ++++
 net/ipv4/ip_input.c | 35 +++++++++++++++++++++++++++++++----
 net/ipv4/route.c    | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/route.h b/include/net/route.h
index 6c516840380d..a9c60fc68e36 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -185,6 +185,10 @@ int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src,
 		       u8 tos, struct net_device *devin,
 		       struct fib_result *res);
 
+int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src,
+		      u8 tos, struct net_device *devin,
+		      const struct sk_buff *hint);
+
 static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
 				 u8 tos, struct net_device *devin)
 {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 24a95126e698..aa438c6758a7 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -302,16 +302,31 @@ drop:
 	return true;
 }
 
+static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
+			    const struct sk_buff *hint)
+{
+	return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr &&
+	       ip_hdr(hint)->tos == iph->tos;
+}
+
 INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *));
 INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *));
 static int ip_rcv_finish_core(struct net *net, struct sock *sk,
-			      struct sk_buff *skb, struct net_device *dev)
+			      struct sk_buff *skb, struct net_device *dev,
+			      const struct sk_buff *hint)
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	int (*edemux)(struct sk_buff *skb);
 	struct rtable *rt;
 	int err;
 
+	if (ip_can_use_hint(skb, iph, hint)) {
+		err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
+					dev, hint);
+		if (unlikely(err))
+			goto drop_error;
+	}
+
 	if (net->ipv4.sysctl_ip_early_demux &&
 	    !skb_dst(skb) &&
 	    !skb->sk &&
@@ -408,7 +423,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	if (!skb)
 		return NET_RX_SUCCESS;
 
-	ret = ip_rcv_finish_core(net, sk, skb, dev);
+	ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
 	if (ret != NET_RX_DROP)
 		ret = dst_input(skb);
 	return ret;
@@ -535,11 +550,20 @@ static void ip_sublist_rcv_finish(struct list_head *head)
 	}
 }
 
+static struct sk_buff *ip_extract_route_hint(const struct net *net,
+					     struct sk_buff *skb, int rt_type)
+{
+	if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST)
+		return NULL;
+
+	return skb;
+}
+
 static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 			       struct list_head *head)
 {
+	struct sk_buff *skb, *next, *hint = NULL;
 	struct dst_entry *curr_dst = NULL;
-	struct sk_buff *skb, *next;
 	struct list_head sublist;
 
 	INIT_LIST_HEAD(&sublist);
@@ -554,11 +578,14 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 		skb = l3mdev_ip_rcv(skb);
 		if (!skb)
 			continue;
-		if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP)
+		if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP)
 			continue;
 
 		dst = skb_dst(skb);
 		if (curr_dst != dst) {
+			hint = ip_extract_route_hint(net, skb,
+					       ((struct rtable *)dst)->rt_type);
+
 			/* dispatch old sublist */
 			if (!list_empty(&sublist))
 				ip_sublist_rcv_finish(&sublist);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index dcc4fa10138d..f88c93c38f11 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2019,10 +2019,52 @@ static int ip_mkroute_input(struct sk_buff *skb,
 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
 }
 
+/* Implements all the saddr-related checks as ip_route_input_slow(),
+ * assuming daddr is valid and the destination is not a local broadcast one.
+ * Uses the provided hint instead of performing a route lookup.
+ */
+int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+		      u8 tos, struct net_device *dev,
+		      const struct sk_buff *hint)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	struct rtable *rt = (struct rtable *)hint;
+	struct net *net = dev_net(dev);
+	int err = -EINVAL;
+	u32 tag = 0;
+
+	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
+		goto martian_source;
+
+	if (ipv4_is_zeronet(saddr))
+		goto martian_source;
+
+	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+		goto martian_source;
+
+	if (rt->rt_type != RTN_LOCAL)
+		goto skip_validate_source;
+
+	tos &= IPTOS_RT_MASK;
+	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
+	if (err < 0)
+		goto martian_source;
+
+skip_validate_source:
+	skb_dst_copy(skb, hint);
+	return 0;
+
+martian_source:
+	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+	return err;
+}
+
 /*
  *	NOTE. We drop all the packets that has local source
  *	addresses, because every properly looped back packet
  *	must have correct destination already attached by output routine.
+ *	Changes in the enforced policies must be applied also to
+ *	ip_route_use_hint().
  *
  *	Such approach solves two big problems:
  *	1. Not simplex devices are handled properly.
-- 
cgit v1.2.3-59-g8ed1b


From db3e1c40cf2f973fbdd52ae0b59a9472b1c04f4a Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Mon, 18 Nov 2019 22:06:08 -0800
Subject: mac80211: Import airtime calculation code from mt76
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Felix recently added code to calculate airtime of packets to the mt76
driver. Import this into mac80211 so we can use it for airtime queue limit
calculations.

The airtime.c file is copied verbatim from the mt76 driver, and adjusted to
be usable in mac80211. This involves:

- Switching to mac80211 data structures.
- Adding support for 160 MHz channels and HE mode.
- Moving the symbol and duration calculations around a bit to avoid
  rounding with the higher rates and longer symbol times used for HE rates.

The per-rate TX rate calculation is also split out to its own function so
it can be used directly for the AQL calculations later.

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/20191119060610.76681-3-kyan@google.com
[fix HE_GROUP_IDX() to use 3 * bw, since there are 3 _gi values]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  29 +++
 net/mac80211/Makefile      |   3 +-
 net/mac80211/airtime.c     | 597 +++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/ieee80211_i.h |   4 +
 4 files changed, 632 insertions(+), 1 deletion(-)
 create mode 100644 net/mac80211/airtime.c

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c643a19dce96..6fc26a051ba0 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -6424,4 +6424,33 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif,
 			      struct cfg80211_nan_match_params *match,
 			      gfp_t gfp);
 
+/**
+ * ieee80211_calc_rx_airtime - calculate estimated transmission airtime for RX.
+ *
+ * This function calculates the estimated airtime usage of a frame based on the
+ * rate information in the RX status struct and the frame length.
+ *
+ * @hw: pointer as obtained from ieee80211_alloc_hw()
+ * @status: &struct ieee80211_rx_status containing the transmission rate
+ *          information.
+ * @len: frame length in bytes
+ */
+u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
+			      struct ieee80211_rx_status *status,
+			      int len);
+
+/**
+ * ieee80211_calc_tx_airtime - calculate estimated transmission airtime for TX.
+ *
+ * This function calculates the estimated airtime usage of a frame based on the
+ * rate information in the TX info struct and the frame length.
+ *
+ * @hw: pointer as obtained from ieee80211_alloc_hw()
+ * @info: &struct ieee80211_tx_info of the frame.
+ * @len: frame length in bytes
+ */
+u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw,
+			      struct ieee80211_tx_info *info,
+			      int len);
+
 #endif /* MAC80211_H */
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index 4f03ebe732fa..6cbb1286d6c0 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -32,7 +32,8 @@ mac80211-y := \
 	chan.o \
 	trace.o mlme.o \
 	tdls.o \
-	ocb.o
+	ocb.o \
+	airtime.o
 
 mac80211-$(CONFIG_MAC80211_LEDS) += led.o
 mac80211-$(CONFIG_MAC80211_DEBUGFS) += \
diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c
new file mode 100644
index 000000000000..63cb0028b02d
--- /dev/null
+++ b/net/mac80211/airtime.c
@@ -0,0 +1,597 @@
+// SPDX-License-Identifier: ISC
+/*
+ * Copyright (C) 2019 Felix Fietkau <nbd@nbd.name>
+ */
+
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "sta_info.h"
+
+#define AVG_PKT_SIZE	1024
+
+/* Number of bits for an average sized packet */
+#define MCS_NBITS (AVG_PKT_SIZE << 3)
+
+/* Number of kilo-symbols (symbols * 1024) for a packet with (bps) bits per
+ * symbol. We use k-symbols to avoid rounding in the _TIME macros below.
+ */
+#define MCS_N_KSYMS(bps) DIV_ROUND_UP(MCS_NBITS << 10, (bps))
+
+/* Transmission time (in 1024 * usec) for a packet containing (ksyms) * 1024
+ * symbols.
+ */
+#define MCS_SYMBOL_TIME(sgi, ksyms)					\
+	(sgi ?								\
+	  ((ksyms) * 4 * 18) / 20 :		/* 3.6 us per sym */	\
+	  ((ksyms) * 4)			/* 4.0 us per sym */	\
+	)
+
+/* Transmit duration for the raw data part of an average sized packet */
+#define MCS_DURATION(streams, sgi, bps) \
+	((u32)MCS_SYMBOL_TIME(sgi, MCS_N_KSYMS((streams) * (bps))))
+
+#define MCS_DURATION_S(shift, streams, sgi, bps)		\
+	((u16)((MCS_DURATION(streams, sgi, bps) >> shift)))
+
+/* These should match the values in enum nl80211_he_gi */
+#define HE_GI_08 0
+#define HE_GI_16 1
+#define HE_GI_32 2
+
+/* Transmission time (1024 usec) for a packet containing (ksyms) * k-symbols */
+#define HE_SYMBOL_TIME(gi, ksyms)					\
+	(gi == HE_GI_08 ?						\
+	 ((ksyms) * 16 * 17) / 20 :		/* 13.6 us per sym */	\
+	 (gi == HE_GI_16 ?						\
+	  ((ksyms) * 16 * 18) / 20 :		/* 14.4 us per sym */	\
+	  ((ksyms) * 16)			/* 16.0 us per sym */	\
+	 ))
+
+/* Transmit duration for the raw data part of an average sized packet */
+#define HE_DURATION(streams, gi, bps) \
+	((u32)HE_SYMBOL_TIME(gi, MCS_N_KSYMS((streams) * (bps))))
+
+#define HE_DURATION_S(shift, streams, gi, bps)		\
+	(HE_DURATION(streams, gi, bps) >> shift)
+
+#define BW_20			0
+#define BW_40			1
+#define BW_80			2
+#define BW_160			3
+
+/*
+ * Define group sort order: HT40 -> SGI -> #streams
+ */
+#define IEEE80211_MAX_STREAMS		4
+#define IEEE80211_HT_STREAM_GROUPS	4 /* BW(=2) * SGI(=2) */
+#define IEEE80211_VHT_STREAM_GROUPS	8 /* BW(=4) * SGI(=2) */
+
+#define IEEE80211_HE_MAX_STREAMS	8
+#define IEEE80211_HE_STREAM_GROUPS	12 /* BW(=4) * GI(=3) */
+
+#define IEEE80211_HT_GROUPS_NB	(IEEE80211_MAX_STREAMS *	\
+				 IEEE80211_HT_STREAM_GROUPS)
+#define IEEE80211_VHT_GROUPS_NB	(IEEE80211_MAX_STREAMS *	\
+					 IEEE80211_VHT_STREAM_GROUPS)
+#define IEEE80211_HE_GROUPS_NB	(IEEE80211_HE_MAX_STREAMS *	\
+				 IEEE80211_HE_STREAM_GROUPS)
+#define IEEE80211_GROUPS_NB	(IEEE80211_HT_GROUPS_NB +	\
+				 IEEE80211_VHT_GROUPS_NB +	\
+				 IEEE80211_HE_GROUPS_NB)
+
+#define IEEE80211_HT_GROUP_0	0
+#define IEEE80211_VHT_GROUP_0	(IEEE80211_HT_GROUP_0 + IEEE80211_HT_GROUPS_NB)
+#define IEEE80211_HE_GROUP_0	(IEEE80211_VHT_GROUP_0 + IEEE80211_VHT_GROUPS_NB)
+
+#define MCS_GROUP_RATES		12
+
+#define HT_GROUP_IDX(_streams, _sgi, _ht40)	\
+	IEEE80211_HT_GROUP_0 +			\
+	IEEE80211_MAX_STREAMS * 2 * _ht40 +	\
+	IEEE80211_MAX_STREAMS * _sgi +		\
+	_streams - 1
+
+#define _MAX(a, b) (((a)>(b))?(a):(b))
+
+#define GROUP_SHIFT(duration)						\
+	_MAX(0, 16 - __builtin_clz(duration))
+
+/* MCS rate information for an MCS group */
+#define __MCS_GROUP(_streams, _sgi, _ht40, _s)				\
+	[HT_GROUP_IDX(_streams, _sgi, _ht40)] = {			\
+	.shift = _s,							\
+	.duration = {							\
+		MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 54 : 26),	\
+		MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 108 : 52),	\
+		MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 162 : 78),	\
+		MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 216 : 104),	\
+		MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 324 : 156),	\
+		MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 432 : 208),	\
+		MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 486 : 234),	\
+		MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 540 : 260)	\
+	}								\
+}
+
+#define MCS_GROUP_SHIFT(_streams, _sgi, _ht40)				\
+	GROUP_SHIFT(MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26))
+
+#define MCS_GROUP(_streams, _sgi, _ht40)				\
+	__MCS_GROUP(_streams, _sgi, _ht40,				\
+		    MCS_GROUP_SHIFT(_streams, _sgi, _ht40))
+
+#define VHT_GROUP_IDX(_streams, _sgi, _bw)				\
+	(IEEE80211_VHT_GROUP_0 +					\
+	 IEEE80211_MAX_STREAMS * 2 * (_bw) +				\
+	 IEEE80211_MAX_STREAMS * (_sgi) +				\
+	 (_streams) - 1)
+
+#define BW2VBPS(_bw, r4, r3, r2, r1)					\
+	(_bw == BW_160 ? r4 : _bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1)
+
+#define __VHT_GROUP(_streams, _sgi, _bw, _s)				\
+	[VHT_GROUP_IDX(_streams, _sgi, _bw)] = {			\
+	.shift = _s,							\
+	.duration = {							\
+		MCS_DURATION_S(_s, _streams, _sgi,			\
+			       BW2VBPS(_bw,  234,  117,  54,  26)),	\
+		MCS_DURATION_S(_s, _streams, _sgi,			\
+			       BW2VBPS(_bw,  468,  234, 108,  52)),	\
+		MCS_DURATION_S(_s, _streams, _sgi,			\
+			       BW2VBPS(_bw,  702,  351, 162,  78)),	\
+		MCS_DURATION_S(_s, _streams, _sgi,			\
+			       BW2VBPS(_bw,  936,  468, 216, 104)),	\
+		MCS_DURATION_S(_s, _streams, _sgi,			\
+			       BW2VBPS(_bw, 1404,  702, 324, 156)),	\
+		MCS_DURATION_S(_s, _streams, _sgi,			\
+			       BW2VBPS(_bw, 1872,  936, 432, 208)),	\
+		MCS_DURATION_S(_s, _streams, _sgi,			\
+			       BW2VBPS(_bw, 2106, 1053, 486, 234)),	\
+		MCS_DURATION_S(_s, _streams, _sgi,			\
+			       BW2VBPS(_bw, 2340, 1170, 540, 260)),	\
+		MCS_DURATION_S(_s, _streams, _sgi,			\
+			       BW2VBPS(_bw, 2808, 1404, 648, 312)),	\
+		MCS_DURATION_S(_s, _streams, _sgi,			\
+			       BW2VBPS(_bw, 3120, 1560, 720, 346))	\
+        }								\
+}
+
+#define VHT_GROUP_SHIFT(_streams, _sgi, _bw)				\
+	GROUP_SHIFT(MCS_DURATION(_streams, _sgi,			\
+				 BW2VBPS(_bw, 243, 117,  54,  26)))
+
+#define VHT_GROUP(_streams, _sgi, _bw)					\
+	__VHT_GROUP(_streams, _sgi, _bw,				\
+		    VHT_GROUP_SHIFT(_streams, _sgi, _bw))
+
+
+#define HE_GROUP_IDX(_streams, _gi, _bw)				\
+	(IEEE80211_HE_GROUP_0 +					\
+	 IEEE80211_HE_MAX_STREAMS * 3 * (_bw) +			\
+	 IEEE80211_HE_MAX_STREAMS * (_gi) +				\
+	 (_streams) - 1)
+
+#define __HE_GROUP(_streams, _gi, _bw, _s)				\
+	[HE_GROUP_IDX(_streams, _gi, _bw)] = {			\
+	.shift = _s,							\
+	.duration = {							\
+		HE_DURATION_S(_s, _streams, _gi,			\
+			      BW2VBPS(_bw,   979,  489,  230,  115)),	\
+		HE_DURATION_S(_s, _streams, _gi,			\
+			      BW2VBPS(_bw,  1958,  979,  475,  230)),	\
+		HE_DURATION_S(_s, _streams, _gi,			\
+			      BW2VBPS(_bw,  2937, 1468,  705,  345)),	\
+		HE_DURATION_S(_s, _streams, _gi,			\
+			      BW2VBPS(_bw,  3916, 1958,  936,  475)),	\
+		HE_DURATION_S(_s, _streams, _gi,			\
+			      BW2VBPS(_bw,  5875, 2937, 1411,  705)),	\
+		HE_DURATION_S(_s, _streams, _gi,			\
+			      BW2VBPS(_bw,  7833, 3916, 1872,  936)),	\
+		HE_DURATION_S(_s, _streams, _gi,			\
+			      BW2VBPS(_bw,  8827, 4406, 2102, 1051)),	\
+		HE_DURATION_S(_s, _streams, _gi,			\
+			      BW2VBPS(_bw,  9806, 4896, 2347, 1166)),	\
+		HE_DURATION_S(_s, _streams, _gi,			\
+			      BW2VBPS(_bw, 11764, 5875, 2808, 1411)),	\
+		HE_DURATION_S(_s, _streams, _gi,			\
+			      BW2VBPS(_bw, 13060, 6523, 3124, 1555)),	\
+		HE_DURATION_S(_s, _streams, _gi,			\
+			      BW2VBPS(_bw, 14702, 7344, 3513, 1756)),	\
+		HE_DURATION_S(_s, _streams, _gi,			\
+			      BW2VBPS(_bw, 16329, 8164, 3902, 1944))	\
+        }								\
+}
+
+#define HE_GROUP_SHIFT(_streams, _gi, _bw)				\
+	GROUP_SHIFT(HE_DURATION(_streams, _gi,			\
+				BW2VBPS(_bw,   979,  489,  230,  115)))
+
+#define HE_GROUP(_streams, _gi, _bw)					\
+	__HE_GROUP(_streams, _gi, _bw,				\
+		   HE_GROUP_SHIFT(_streams, _gi, _bw))
+struct mcs_group {
+	u8 shift;
+	u16 duration[MCS_GROUP_RATES];
+};
+
+static const struct mcs_group airtime_mcs_groups[] = {
+	MCS_GROUP(1, 0, BW_20),
+	MCS_GROUP(2, 0, BW_20),
+	MCS_GROUP(3, 0, BW_20),
+	MCS_GROUP(4, 0, BW_20),
+
+	MCS_GROUP(1, 1, BW_20),
+	MCS_GROUP(2, 1, BW_20),
+	MCS_GROUP(3, 1, BW_20),
+	MCS_GROUP(4, 1, BW_20),
+
+	MCS_GROUP(1, 0, BW_40),
+	MCS_GROUP(2, 0, BW_40),
+	MCS_GROUP(3, 0, BW_40),
+	MCS_GROUP(4, 0, BW_40),
+
+	MCS_GROUP(1, 1, BW_40),
+	MCS_GROUP(2, 1, BW_40),
+	MCS_GROUP(3, 1, BW_40),
+	MCS_GROUP(4, 1, BW_40),
+
+	VHT_GROUP(1, 0, BW_20),
+	VHT_GROUP(2, 0, BW_20),
+	VHT_GROUP(3, 0, BW_20),
+	VHT_GROUP(4, 0, BW_20),
+
+	VHT_GROUP(1, 1, BW_20),
+	VHT_GROUP(2, 1, BW_20),
+	VHT_GROUP(3, 1, BW_20),
+	VHT_GROUP(4, 1, BW_20),
+
+	VHT_GROUP(1, 0, BW_40),
+	VHT_GROUP(2, 0, BW_40),
+	VHT_GROUP(3, 0, BW_40),
+	VHT_GROUP(4, 0, BW_40),
+
+	VHT_GROUP(1, 1, BW_40),
+	VHT_GROUP(2, 1, BW_40),
+	VHT_GROUP(3, 1, BW_40),
+	VHT_GROUP(4, 1, BW_40),
+
+	VHT_GROUP(1, 0, BW_80),
+	VHT_GROUP(2, 0, BW_80),
+	VHT_GROUP(3, 0, BW_80),
+	VHT_GROUP(4, 0, BW_80),
+
+	VHT_GROUP(1, 1, BW_80),
+	VHT_GROUP(2, 1, BW_80),
+	VHT_GROUP(3, 1, BW_80),
+	VHT_GROUP(4, 1, BW_80),
+
+	VHT_GROUP(1, 0, BW_160),
+	VHT_GROUP(2, 0, BW_160),
+	VHT_GROUP(3, 0, BW_160),
+	VHT_GROUP(4, 0, BW_160),
+
+	VHT_GROUP(1, 1, BW_160),
+	VHT_GROUP(2, 1, BW_160),
+	VHT_GROUP(3, 1, BW_160),
+	VHT_GROUP(4, 1, BW_160),
+
+	HE_GROUP(1, HE_GI_08, BW_20),
+	HE_GROUP(2, HE_GI_08, BW_20),
+	HE_GROUP(3, HE_GI_08, BW_20),
+	HE_GROUP(4, HE_GI_08, BW_20),
+	HE_GROUP(5, HE_GI_08, BW_20),
+	HE_GROUP(6, HE_GI_08, BW_20),
+	HE_GROUP(7, HE_GI_08, BW_20),
+	HE_GROUP(8, HE_GI_08, BW_20),
+
+	HE_GROUP(1, HE_GI_16, BW_20),
+	HE_GROUP(2, HE_GI_16, BW_20),
+	HE_GROUP(3, HE_GI_16, BW_20),
+	HE_GROUP(4, HE_GI_16, BW_20),
+	HE_GROUP(5, HE_GI_16, BW_20),
+	HE_GROUP(6, HE_GI_16, BW_20),
+	HE_GROUP(7, HE_GI_16, BW_20),
+	HE_GROUP(8, HE_GI_16, BW_20),
+
+	HE_GROUP(1, HE_GI_32, BW_20),
+	HE_GROUP(2, HE_GI_32, BW_20),
+	HE_GROUP(3, HE_GI_32, BW_20),
+	HE_GROUP(4, HE_GI_32, BW_20),
+	HE_GROUP(5, HE_GI_32, BW_20),
+	HE_GROUP(6, HE_GI_32, BW_20),
+	HE_GROUP(7, HE_GI_32, BW_20),
+	HE_GROUP(8, HE_GI_32, BW_20),
+
+	HE_GROUP(1, HE_GI_08, BW_40),
+	HE_GROUP(2, HE_GI_08, BW_40),
+	HE_GROUP(3, HE_GI_08, BW_40),
+	HE_GROUP(4, HE_GI_08, BW_40),
+	HE_GROUP(5, HE_GI_08, BW_40),
+	HE_GROUP(6, HE_GI_08, BW_40),
+	HE_GROUP(7, HE_GI_08, BW_40),
+	HE_GROUP(8, HE_GI_08, BW_40),
+
+	HE_GROUP(1, HE_GI_16, BW_40),
+	HE_GROUP(2, HE_GI_16, BW_40),
+	HE_GROUP(3, HE_GI_16, BW_40),
+	HE_GROUP(4, HE_GI_16, BW_40),
+	HE_GROUP(5, HE_GI_16, BW_40),
+	HE_GROUP(6, HE_GI_16, BW_40),
+	HE_GROUP(7, HE_GI_16, BW_40),
+	HE_GROUP(8, HE_GI_16, BW_40),
+
+	HE_GROUP(1, HE_GI_32, BW_40),
+	HE_GROUP(2, HE_GI_32, BW_40),
+	HE_GROUP(3, HE_GI_32, BW_40),
+	HE_GROUP(4, HE_GI_32, BW_40),
+	HE_GROUP(5, HE_GI_32, BW_40),
+	HE_GROUP(6, HE_GI_32, BW_40),
+	HE_GROUP(7, HE_GI_32, BW_40),
+	HE_GROUP(8, HE_GI_32, BW_40),
+
+	HE_GROUP(1, HE_GI_08, BW_80),
+	HE_GROUP(2, HE_GI_08, BW_80),
+	HE_GROUP(3, HE_GI_08, BW_80),
+	HE_GROUP(4, HE_GI_08, BW_80),
+	HE_GROUP(5, HE_GI_08, BW_80),
+	HE_GROUP(6, HE_GI_08, BW_80),
+	HE_GROUP(7, HE_GI_08, BW_80),
+	HE_GROUP(8, HE_GI_08, BW_80),
+
+	HE_GROUP(1, HE_GI_16, BW_80),
+	HE_GROUP(2, HE_GI_16, BW_80),
+	HE_GROUP(3, HE_GI_16, BW_80),
+	HE_GROUP(4, HE_GI_16, BW_80),
+	HE_GROUP(5, HE_GI_16, BW_80),
+	HE_GROUP(6, HE_GI_16, BW_80),
+	HE_GROUP(7, HE_GI_16, BW_80),
+	HE_GROUP(8, HE_GI_16, BW_80),
+
+	HE_GROUP(1, HE_GI_32, BW_80),
+	HE_GROUP(2, HE_GI_32, BW_80),
+	HE_GROUP(3, HE_GI_32, BW_80),
+	HE_GROUP(4, HE_GI_32, BW_80),
+	HE_GROUP(5, HE_GI_32, BW_80),
+	HE_GROUP(6, HE_GI_32, BW_80),
+	HE_GROUP(7, HE_GI_32, BW_80),
+	HE_GROUP(8, HE_GI_32, BW_80),
+
+	HE_GROUP(1, HE_GI_08, BW_160),
+	HE_GROUP(2, HE_GI_08, BW_160),
+	HE_GROUP(3, HE_GI_08, BW_160),
+	HE_GROUP(4, HE_GI_08, BW_160),
+	HE_GROUP(5, HE_GI_08, BW_160),
+	HE_GROUP(6, HE_GI_08, BW_160),
+	HE_GROUP(7, HE_GI_08, BW_160),
+	HE_GROUP(8, HE_GI_08, BW_160),
+
+	HE_GROUP(1, HE_GI_16, BW_160),
+	HE_GROUP(2, HE_GI_16, BW_160),
+	HE_GROUP(3, HE_GI_16, BW_160),
+	HE_GROUP(4, HE_GI_16, BW_160),
+	HE_GROUP(5, HE_GI_16, BW_160),
+	HE_GROUP(6, HE_GI_16, BW_160),
+	HE_GROUP(7, HE_GI_16, BW_160),
+	HE_GROUP(8, HE_GI_16, BW_160),
+
+	HE_GROUP(1, HE_GI_32, BW_160),
+	HE_GROUP(2, HE_GI_32, BW_160),
+	HE_GROUP(3, HE_GI_32, BW_160),
+	HE_GROUP(4, HE_GI_32, BW_160),
+	HE_GROUP(5, HE_GI_32, BW_160),
+	HE_GROUP(6, HE_GI_32, BW_160),
+	HE_GROUP(7, HE_GI_32, BW_160),
+	HE_GROUP(8, HE_GI_32, BW_160),
+};
+
+static u32
+ieee80211_calc_legacy_rate_duration(u16 bitrate, bool short_pre,
+				    bool cck, int len)
+{
+	u32 duration;
+
+	if (cck) {
+		duration = 144 + 48; /* preamble + PLCP */
+		if (short_pre)
+			duration >>= 1;
+
+		duration += 10; /* SIFS */
+	} else {
+		duration = 20 + 16; /* premable + SIFS */
+	}
+
+	len <<= 3;
+	duration += (len * 10) / bitrate;
+
+	return duration;
+}
+
+u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
+			      struct ieee80211_rx_status *status,
+			      int len)
+{
+	struct ieee80211_supported_band *sband;
+	const struct ieee80211_rate *rate;
+	bool sgi = status->enc_flags & RX_ENC_FLAG_SHORT_GI;
+	bool sp = status->enc_flags & RX_ENC_FLAG_SHORTPRE;
+	int bw, streams;
+	int group, idx;
+	u32 duration;
+	bool cck;
+
+	switch (status->bw) {
+	case RATE_INFO_BW_20:
+		bw = BW_20;
+		break;
+	case RATE_INFO_BW_40:
+		bw = BW_40;
+		break;
+	case RATE_INFO_BW_80:
+		bw = BW_80;
+		break;
+	case RATE_INFO_BW_160:
+		bw = BW_160;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	switch (status->encoding) {
+	case RX_ENC_LEGACY:
+		if (WARN_ON_ONCE(status->band > NL80211_BAND_5GHZ))
+			return 0;
+
+		sband = hw->wiphy->bands[status->band];
+		if (!sband || status->rate_idx > sband->n_bitrates)
+			return 0;
+
+		rate = &sband->bitrates[status->rate_idx];
+		cck = rate->flags & IEEE80211_RATE_MANDATORY_B;
+
+		return ieee80211_calc_legacy_rate_duration(rate->bitrate, sp,
+							   cck, len);
+
+	case RX_ENC_VHT:
+		streams = status->nss;
+		idx = status->rate_idx;
+		group = VHT_GROUP_IDX(streams, sgi, bw);
+		break;
+	case RX_ENC_HT:
+		streams = ((status->rate_idx >> 3) & 3) + 1;
+		idx = status->rate_idx & 7;
+		group = HT_GROUP_IDX(streams, sgi, bw);
+		break;
+	case RX_ENC_HE:
+		streams = status->nss;
+		idx = status->rate_idx;
+		group = HE_GROUP_IDX(streams, status->he_gi, bw);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	if (WARN_ON_ONCE((status->encoding != RX_ENC_HE && streams > 4) ||
+			 (status->encoding == RX_ENC_HE && streams > 8)))
+		return 0;
+
+	duration = airtime_mcs_groups[group].duration[idx];
+	duration <<= airtime_mcs_groups[group].shift;
+	duration *= len;
+	duration /= AVG_PKT_SIZE;
+	duration /= 1024;
+
+	duration += 36 + (streams << 2);
+
+	return duration;
+}
+EXPORT_SYMBOL_GPL(ieee80211_calc_rx_airtime);
+
+static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw,
+					  struct ieee80211_tx_rate *rate,
+					  u8 band, int len)
+{
+	struct ieee80211_rx_status stat = {
+		.band = band,
+	};
+
+	if (rate->idx < 0 || !rate->count)
+		return 0;
+
+	if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH)
+		stat.bw = RATE_INFO_BW_80;
+	else if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+		stat.bw = RATE_INFO_BW_40;
+	else
+		stat.bw = RATE_INFO_BW_20;
+
+	stat.enc_flags = 0;
+	if (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE)
+		stat.enc_flags |= RX_ENC_FLAG_SHORTPRE;
+	if (rate->flags & IEEE80211_TX_RC_SHORT_GI)
+		stat.enc_flags |= RX_ENC_FLAG_SHORT_GI;
+
+	stat.rate_idx = rate->idx;
+	if (rate->flags & IEEE80211_TX_RC_VHT_MCS) {
+		stat.encoding = RX_ENC_VHT;
+		stat.rate_idx = ieee80211_rate_get_vht_mcs(rate);
+		stat.nss = ieee80211_rate_get_vht_nss(rate);
+	} else if (rate->flags & IEEE80211_TX_RC_MCS) {
+		stat.encoding = RX_ENC_HT;
+	} else {
+		stat.encoding = RX_ENC_LEGACY;
+	}
+
+	return ieee80211_calc_rx_airtime(hw, &stat, len);
+}
+
+u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw,
+			      struct ieee80211_tx_info *info,
+			      int len)
+{
+	u32 duration = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(info->status.rates); i++) {
+		struct ieee80211_tx_rate *rate = &info->status.rates[i];
+		u32 cur_duration;
+
+		cur_duration = ieee80211_calc_tx_airtime_rate(hw, rate,
+							      info->band, len);
+		if (!cur_duration)
+			break;
+
+		duration += cur_duration * rate->count;
+	}
+
+	return duration;
+}
+EXPORT_SYMBOL_GPL(ieee80211_calc_tx_airtime);
+
+u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
+				       struct ieee80211_vif *vif,
+				       struct ieee80211_sta *pubsta,
+				       int len)
+{
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_chanctx_conf *conf;
+	int rateidx, shift = 0;
+	bool cck, short_pream;
+	u32 basic_rates;
+	u8 band = 0;
+	u16 rate;
+
+	len += 38; /* Ethernet header length */
+
+	conf = rcu_dereference(vif->chanctx_conf);
+	if (conf) {
+		band = conf->def.chan->band;
+		shift = ieee80211_chandef_get_shift(&conf->def);
+	}
+
+	if (pubsta) {
+		struct sta_info *sta = container_of(pubsta, struct sta_info,
+						    sta);
+
+		return ieee80211_calc_tx_airtime_rate(hw,
+						      &sta->tx_stats.last_rate,
+						      band, len);
+	}
+
+	if (!conf)
+		return 0;
+
+	/* No station to get latest rate from, so calculate the worst-case
+	 * duration using the lowest configured basic rate.
+	 */
+	sband = hw->wiphy->bands[band];
+
+	basic_rates = vif->bss_conf.basic_rates;
+	short_pream = vif->bss_conf.use_short_preamble;
+
+	rateidx = basic_rates ? ffs(basic_rates) - 1 : 0;
+	rate = sband->bitrates[rateidx].bitrate << shift;
+	cck = sband->bitrates[rateidx].flags & IEEE80211_RATE_MANDATORY_B;
+
+	return ieee80211_calc_legacy_rate_duration(rate, short_pream, cck, len);
+}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 05406e9c05b3..225ea4e3cd76 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2249,6 +2249,10 @@ const char *ieee80211_get_reason_code_string(u16 reason_code);
 
 extern const struct ethtool_ops ieee80211_ethtool_ops;
 
+u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
+				       struct ieee80211_vif *vif,
+				       struct ieee80211_sta *pubsta,
+				       int len);
 #ifdef CONFIG_MAC80211_NOINLINE
 #define debug_noinline noinline
 #else
-- 
cgit v1.2.3-59-g8ed1b


From 3ace10f5b5ad94bdbd4b419dc9da2217d57720a9 Mon Sep 17 00:00:00 2001
From: Kan Yan <kyan@google.com>
Date: Mon, 18 Nov 2019 22:06:09 -0800
Subject: mac80211: Implement Airtime-based Queue Limit (AQL)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order for the Fq_CoDel algorithm integrated in mac80211 layer to operate
effectively to control excessive queueing latency, the CoDel algorithm
requires an accurate measure of how long packets stays in the queue, AKA
sojourn time. The sojourn time measured at the mac80211 layer doesn't
include queueing latency in the lower layer (firmware/hardware) and CoDel
expects lower layer to have a short queue. However, most 802.11ac chipsets
offload tasks such TX aggregation to firmware or hardware, thus have a deep
lower layer queue.

Without a mechanism to control the lower layer queue size, packets only
stay in mac80211 layer transiently before being sent to firmware queue.
As a result, the sojourn time measured by CoDel in the mac80211 layer is
almost always lower than the CoDel latency target, hence CoDel does little
to control the latency, even when the lower layer queue causes excessive
latency.

The Byte Queue Limits (BQL) mechanism is commonly used to address the
similar issue with wired network interface. However, this method cannot be
applied directly to the wireless network interface. "Bytes" is not a
suitable measure of queue depth in the wireless network, as the data rate
can vary dramatically from station to station in the same network, from a
few Mbps to over Gbps.

This patch implements an Airtime-based Queue Limit (AQL) to make CoDel work
effectively with wireless drivers that utilized firmware/hardware
offloading. AQL allows each txq to release just enough packets to the lower
layer to form 1-2 large aggregations to keep hardware fully utilized and
retains the rest of the frames in mac80211 layer to be controlled by the
CoDel algorithm.

Signed-off-by: Kan Yan <kyan@google.com>
[ Toke: Keep API to set pending airtime internal, fix nits in commit msg ]
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/20191119060610.76681-4-kyan@google.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h     |  7 ++++
 include/net/mac80211.h     | 12 +++++++
 net/mac80211/debugfs.c     | 85 ++++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/debugfs_sta.c | 43 +++++++++++++++++------
 net/mac80211/ieee80211_i.h |  4 +++
 net/mac80211/main.c        | 10 +++++-
 net/mac80211/sta_info.c    | 38 +++++++++++++++++++++
 net/mac80211/sta_info.h    |  8 +++++
 net/mac80211/tx.c          | 51 ++++++++++++++++++++++++++--
 9 files changed, 244 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5ded77fad7fb..059524b87c4c 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2606,6 +2606,13 @@ enum wiphy_params_flags {
 
 #define IEEE80211_DEFAULT_AIRTIME_WEIGHT	256
 
+/* The per TXQ device queue limit in airtime */
+#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L	5000
+#define IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H	12000
+
+/* The per interface airtime threshold to switch to lower queue limit */
+#define IEEE80211_AQL_THRESHOLD			24000
+
 /**
  * struct cfg80211_pmksa - PMK Security Association
  *
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 6fc26a051ba0..ba3f33cc41ea 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5565,6 +5565,18 @@ void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid);
 void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid,
 				    u32 tx_airtime, u32 rx_airtime);
 
+/**
+ * ieee80211_txq_airtime_check - check if a txq can send frame to device
+ *
+ * @hw: pointer obtained from ieee80211_alloc_hw()
+ * @txq: pointer obtained from station or virtual interface
+ *
+ * Return true if the AQL's airtime limit has not been reached and the txq can
+ * continue to send more packets to the device. Otherwise return false.
+ */
+bool
+ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq);
+
 /**
  * ieee80211_iter_keys - iterate keys programmed into the device
  * @hw: pointer obtained from ieee80211_alloc_hw()
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 5c52429038c3..ad41d74530c6 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -150,6 +150,87 @@ static const struct file_operations aqm_ops = {
 	.llseek = default_llseek,
 };
 
+static ssize_t aql_txq_limit_read(struct file *file,
+				  char __user *user_buf,
+				  size_t count,
+				  loff_t *ppos)
+{
+	struct ieee80211_local *local = file->private_data;
+	char buf[400];
+	int len = 0;
+
+	len = scnprintf(buf, sizeof(buf),
+			"AC	AQL limit low	AQL limit high\n"
+			"VO	%u		%u\n"
+			"VI	%u		%u\n"
+			"BE	%u		%u\n"
+			"BK	%u		%u\n",
+			local->aql_txq_limit_low[IEEE80211_AC_VO],
+			local->aql_txq_limit_high[IEEE80211_AC_VO],
+			local->aql_txq_limit_low[IEEE80211_AC_VI],
+			local->aql_txq_limit_high[IEEE80211_AC_VI],
+			local->aql_txq_limit_low[IEEE80211_AC_BE],
+			local->aql_txq_limit_high[IEEE80211_AC_BE],
+			local->aql_txq_limit_low[IEEE80211_AC_BK],
+			local->aql_txq_limit_high[IEEE80211_AC_BK]);
+	return simple_read_from_buffer(user_buf, count, ppos,
+				       buf, len);
+}
+
+static ssize_t aql_txq_limit_write(struct file *file,
+				   const char __user *user_buf,
+				   size_t count,
+				   loff_t *ppos)
+{
+	struct ieee80211_local *local = file->private_data;
+	char buf[100];
+	size_t len;
+	u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old;
+	struct sta_info *sta;
+
+	if (count > sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(buf, user_buf, count))
+		return -EFAULT;
+
+	buf[sizeof(buf) - 1] = 0;
+	len = strlen(buf);
+	if (len > 0 && buf[len - 1] == '\n')
+		buf[len - 1] = 0;
+
+	if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3)
+		return -EINVAL;
+
+	if (ac >= IEEE80211_NUM_ACS)
+		return -EINVAL;
+
+	q_limit_low_old = local->aql_txq_limit_low[ac];
+	q_limit_high_old = local->aql_txq_limit_high[ac];
+
+	local->aql_txq_limit_low[ac] = q_limit_low;
+	local->aql_txq_limit_high[ac] = q_limit_high;
+
+	mutex_lock(&local->sta_mtx);
+	list_for_each_entry(sta, &local->sta_list, list) {
+		/* If a sta has customized queue limits, keep it */
+		if (sta->airtime[ac].aql_limit_low == q_limit_low_old &&
+		    sta->airtime[ac].aql_limit_high == q_limit_high_old) {
+			sta->airtime[ac].aql_limit_low = q_limit_low;
+			sta->airtime[ac].aql_limit_high = q_limit_high;
+		}
+	}
+	mutex_unlock(&local->sta_mtx);
+	return count;
+}
+
+static const struct file_operations aql_txq_limit_ops = {
+	.write = aql_txq_limit_write,
+	.read = aql_txq_limit_read,
+	.open = simple_open,
+	.llseek = default_llseek,
+};
+
 static ssize_t force_tx_status_read(struct file *file,
 				    char __user *user_buf,
 				    size_t count,
@@ -444,6 +525,10 @@ void debugfs_hw_add(struct ieee80211_local *local)
 	debugfs_create_u16("airtime_flags", 0600,
 			   phyd, &local->airtime_flags);
 
+	DEBUGFS_ADD(aql_txq_limit);
+	debugfs_create_u32("aql_threshold", 0600,
+			   phyd, &local->aql_threshold);
+
 	statsd = debugfs_create_dir("statistics", phyd);
 
 	/* if the dir failed, don't put all the other things into the root! */
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index c8ad20c28c43..0185e6e5e5d1 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -197,10 +197,12 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf,
 {
 	struct sta_info *sta = file->private_data;
 	struct ieee80211_local *local = sta->sdata->local;
-	size_t bufsz = 200;
+	size_t bufsz = 400;
 	char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
 	u64 rx_airtime = 0, tx_airtime = 0;
 	s64 deficit[IEEE80211_NUM_ACS];
+	u32 q_depth[IEEE80211_NUM_ACS];
+	u32 q_limit_l[IEEE80211_NUM_ACS], q_limit_h[IEEE80211_NUM_ACS];
 	ssize_t rv;
 	int ac;
 
@@ -212,19 +214,22 @@ static ssize_t sta_airtime_read(struct file *file, char __user *userbuf,
 		rx_airtime += sta->airtime[ac].rx_airtime;
 		tx_airtime += sta->airtime[ac].tx_airtime;
 		deficit[ac] = sta->airtime[ac].deficit;
+		q_limit_l[ac] = sta->airtime[ac].aql_limit_low;
+		q_limit_h[ac] = sta->airtime[ac].aql_limit_high;
 		spin_unlock_bh(&local->active_txq_lock[ac]);
+		q_depth[ac] = atomic_read(&sta->airtime[ac].aql_tx_pending);
 	}
 
 	p += scnprintf(p, bufsz + buf - p,
 		"RX: %llu us\nTX: %llu us\nWeight: %u\n"
-		"Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n",
-		rx_airtime,
-		tx_airtime,
-		sta->airtime_weight,
-		deficit[0],
-		deficit[1],
-		deficit[2],
-		deficit[3]);
+		"Deficit: VO: %lld us VI: %lld us BE: %lld us BK: %lld us\n"
+		"Q depth: VO: %u us VI: %u us BE: %u us BK: %u us\n"
+		"Q limit[low/high]: VO: %u/%u VI: %u/%u BE: %u/%u BK: %u/%u\n",
+		rx_airtime, tx_airtime, sta->airtime_weight,
+		deficit[0], deficit[1], deficit[2], deficit[3],
+		q_depth[0], q_depth[1], q_depth[2], q_depth[3],
+		q_limit_l[0], q_limit_h[0], q_limit_l[1], q_limit_h[1],
+		q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]),
 
 	rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
 	kfree(buf);
@@ -236,7 +241,25 @@ static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf,
 {
 	struct sta_info *sta = file->private_data;
 	struct ieee80211_local *local = sta->sdata->local;
-	int ac;
+	u32 ac, q_limit_l, q_limit_h;
+	char _buf[100] = {}, *buf = _buf;
+
+	if (count > sizeof(_buf))
+		return -EINVAL;
+
+	if (copy_from_user(buf, userbuf, count))
+		return -EFAULT;
+
+	buf[sizeof(_buf) - 1] = '\0';
+	if (sscanf(buf, "queue limit %u %u %u", &ac, &q_limit_l, &q_limit_h)
+	    != 3)
+		return -EINVAL;
+
+	if (ac >= IEEE80211_NUM_ACS)
+		return -EINVAL;
+
+	sta->airtime[ac].aql_limit_low = q_limit_l;
+	sta->airtime[ac].aql_limit_high = q_limit_h;
 
 	for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
 		spin_lock_bh(&local->active_txq_lock[ac]);
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 225ea4e3cd76..ad15b3be8bb3 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1142,6 +1142,10 @@ struct ieee80211_local {
 	u16 schedule_round[IEEE80211_NUM_ACS];
 
 	u16 airtime_flags;
+	u32 aql_txq_limit_low[IEEE80211_NUM_ACS];
+	u32 aql_txq_limit_high[IEEE80211_NUM_ACS];
+	u32 aql_threshold;
+	atomic_t aql_total_pending_airtime;
 
 	const struct ieee80211_ops *ops;
 
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 2d05c4cfaf6d..6cca0853f183 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -667,8 +667,16 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 	for (i = 0; i < IEEE80211_NUM_ACS; i++) {
 		INIT_LIST_HEAD(&local->active_txqs[i]);
 		spin_lock_init(&local->active_txq_lock[i]);
+		local->aql_txq_limit_low[i] = IEEE80211_DEFAULT_AQL_TXQ_LIMIT_L;
+		local->aql_txq_limit_high[i] =
+			IEEE80211_DEFAULT_AQL_TXQ_LIMIT_H;
 	}
-	local->airtime_flags = AIRTIME_USE_TX | AIRTIME_USE_RX;
+
+	local->airtime_flags = AIRTIME_USE_TX |
+			       AIRTIME_USE_RX |
+			       AIRTIME_USE_AQL;
+	local->aql_threshold = IEEE80211_AQL_THRESHOLD;
+	atomic_set(&local->aql_total_pending_airtime, 0);
 
 	INIT_LIST_HEAD(&local->chanctx_list);
 	mutex_init(&local->chanctx_mtx);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 41bf32080dac..8eafd81e97b4 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -410,6 +410,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
 		skb_queue_head_init(&sta->ps_tx_buf[i]);
 		skb_queue_head_init(&sta->tx_filtered[i]);
 		sta->airtime[i].deficit = sta->airtime_weight;
+		atomic_set(&sta->airtime[i].aql_tx_pending, 0);
+		sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i];
+		sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i];
 	}
 
 	for (i = 0; i < IEEE80211_NUM_TIDS; i++)
@@ -1907,6 +1910,41 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid,
 }
 EXPORT_SYMBOL(ieee80211_sta_register_airtime);
 
+void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local,
+					  struct sta_info *sta, u8 ac,
+					  u16 tx_airtime, bool tx_completed)
+{
+	int tx_pending;
+
+	if (!tx_completed) {
+		if (sta)
+			atomic_add(tx_airtime,
+				   &sta->airtime[ac].aql_tx_pending);
+
+		atomic_add(tx_airtime, &local->aql_total_pending_airtime);
+		return;
+	}
+
+	if (sta) {
+		tx_pending = atomic_sub_return(tx_airtime,
+					       &sta->airtime[ac].aql_tx_pending);
+		if (WARN_ONCE(tx_pending < 0,
+			      "STA %pM AC %d txq pending airtime underflow: %u, %u",
+			      sta->addr, ac, tx_pending, tx_airtime))
+			atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending,
+				       tx_pending, 0);
+	}
+
+	tx_pending = atomic_sub_return(tx_airtime,
+				       &local->aql_total_pending_airtime);
+	if (WARN_ONCE(tx_pending < 0,
+		      "Device %s AC %d pending airtime underflow: %u, %u",
+		      wiphy_name(local->hw.wiphy), ac, tx_pending,
+		      tx_airtime))
+		atomic_cmpxchg(&local->aql_total_pending_airtime,
+			       tx_pending, 0);
+}
+
 int sta_info_move_state(struct sta_info *sta,
 			enum ieee80211_sta_state new_state)
 {
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 0bd69a794758..ad5d8a4ae56d 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -127,13 +127,21 @@ enum ieee80211_agg_stop_reason {
 /* Debugfs flags to enable/disable use of RX/TX airtime in scheduler */
 #define AIRTIME_USE_TX		BIT(0)
 #define AIRTIME_USE_RX		BIT(1)
+#define AIRTIME_USE_AQL		BIT(2)
 
 struct airtime_info {
 	u64 rx_airtime;
 	u64 tx_airtime;
 	s64 deficit;
+	atomic_t aql_tx_pending; /* Estimated airtime for frames pending */
+	u32 aql_limit_low;
+	u32 aql_limit_high;
 };
 
+void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local,
+					  struct sta_info *sta, u8 ac,
+					  u16 tx_airtime, bool tx_completed);
+
 struct sta_info;
 
 /**
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index a53af8cd3756..c7b9b024d0f0 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -3677,7 +3677,8 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_txq *ret = NULL;
-	struct txq_info *txqi = NULL;
+	struct txq_info *txqi = NULL, *head = NULL;
+	bool found_eligible_txq = false;
 
 	spin_lock_bh(&local->active_txq_lock[ac]);
 
@@ -3688,13 +3689,30 @@ struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw, u8 ac)
 	if (!txqi)
 		goto out;
 
+	if (txqi == head) {
+		if (!found_eligible_txq)
+			goto out;
+		else
+			found_eligible_txq = false;
+	}
+
+	if (!head)
+		head = txqi;
+
 	if (txqi->txq.sta) {
 		struct sta_info *sta = container_of(txqi->txq.sta,
-						struct sta_info, sta);
+						    struct sta_info, sta);
+		bool aql_check = ieee80211_txq_airtime_check(hw, &txqi->txq);
+		s64 deficit = sta->airtime[txqi->txq.ac].deficit;
 
-		if (sta->airtime[txqi->txq.ac].deficit < 0) {
+		if (aql_check)
+			found_eligible_txq = true;
+
+		if (deficit < 0)
 			sta->airtime[txqi->txq.ac].deficit +=
 				sta->airtime_weight;
+
+		if (deficit < 0 || !aql_check) {
 			list_move_tail(&txqi->schedule_order,
 				       &local->active_txqs[txqi->txq.ac]);
 			goto begin;
@@ -3748,6 +3766,33 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw,
 }
 EXPORT_SYMBOL(__ieee80211_schedule_txq);
 
+bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw,
+				 struct ieee80211_txq *txq)
+{
+	struct sta_info *sta;
+	struct ieee80211_local *local = hw_to_local(hw);
+
+	if (!(local->airtime_flags & AIRTIME_USE_AQL))
+		return true;
+
+	if (!txq->sta)
+		return true;
+
+	sta = container_of(txq->sta, struct sta_info, sta);
+	if (atomic_read(&sta->airtime[txq->ac].aql_tx_pending) <
+	    sta->airtime[txq->ac].aql_limit_low)
+		return true;
+
+	if (atomic_read(&local->aql_total_pending_airtime) <
+	    local->aql_threshold &&
+	    atomic_read(&sta->airtime[txq->ac].aql_tx_pending) <
+	    sta->airtime[txq->ac].aql_limit_high)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL(ieee80211_txq_airtime_check);
+
 bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw,
 				struct ieee80211_txq *txq)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 7a89233ac50468a3a9636803a85d06c8f907f8ee Mon Sep 17 00:00:00 2001
From: Toke Høiland-Jørgensen <toke@redhat.com>
Date: Mon, 18 Nov 2019 22:06:10 -0800
Subject: mac80211: Use Airtime-based Queue Limits (AQL) on packet dequeue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous commit added the ability to throttle stations when they queue
too much airtime in the hardware. This commit enables the functionality by
calculating the expected airtime usage of each packet that is dequeued from
the TXQs in mac80211, and accounting that as pending airtime.

The estimated airtime for each skb is stored in the tx_info, so we can
subtract the same amount from the running total when the skb is freed or
recycled. The throttling mechanism relies on this accounting to be
accurate (i.e., that we are not freeing skbs without subtracting any
airtime they were accounted for), so we put the subtraction into
ieee80211_report_used_skb(). As an optimisation, we also subtract the
airtime on regular TX completion, zeroing out the value stored in the
packet afterwards, to avoid having to do an expensive lookup of the station
from the packet data on every packet.

This patch does *not* include any mechanism to wake a throttled TXQ again,
on the assumption that this will happen anyway as a side effect of whatever
freed the skb (most commonly a TX completion).

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/20191119060610.76681-5-kyan@google.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 16 ++++++++++++++++
 net/mac80211/status.c  | 26 ++++++++++++++++++++++++++
 net/mac80211/tx.c      | 18 ++++++++++++++++++
 3 files changed, 60 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ba3f33cc41ea..aa145808e57a 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1060,6 +1060,22 @@ struct ieee80211_tx_info {
 	};
 };
 
+static inline u16
+ieee80211_info_set_tx_time_est(struct ieee80211_tx_info *info, u16 tx_time_est)
+{
+	/* We only have 10 bits in tx_time_est, so store airtime
+	 * in increments of 4us and clamp the maximum to 2**12-1
+	 */
+	info->tx_time_est = min_t(u16, tx_time_est, 4095) >> 2;
+	return info->tx_time_est << 2;
+}
+
+static inline u16
+ieee80211_info_get_tx_time_est(struct ieee80211_tx_info *info)
+{
+	return info->tx_time_est << 2;
+}
+
 /**
  * struct ieee80211_tx_status - extended tx status info for rate control
  *
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 7b39ed86a8ad..b720feaf9a74 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -670,12 +670,26 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
 				      struct sk_buff *skb, bool dropped)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	u16 tx_time_est = ieee80211_info_get_tx_time_est(info);
 	struct ieee80211_hdr *hdr = (void *)skb->data;
 	bool acked = info->flags & IEEE80211_TX_STAT_ACK;
 
 	if (dropped)
 		acked = false;
 
+	if (tx_time_est) {
+		struct sta_info *sta;
+
+		rcu_read_lock();
+
+		sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2);
+		ieee80211_sta_update_pending_airtime(local, sta,
+						     skb_get_queue_mapping(skb),
+						     tx_time_est,
+						     true);
+		rcu_read_unlock();
+	}
+
 	if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) {
 		struct ieee80211_sub_if_data *sdata;
 
@@ -877,6 +891,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
 	struct ieee80211_bar *bar;
 	int shift = 0;
 	int tid = IEEE80211_NUM_TIDS;
+	u16 tx_time_est;
 
 	rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
 
@@ -986,6 +1001,17 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
 			ieee80211_sta_register_airtime(&sta->sta, tid,
 						       info->status.tx_time, 0);
 
+		if ((tx_time_est = ieee80211_info_get_tx_time_est(info)) > 0) {
+			/* Do this here to avoid the expensive lookup of the sta
+			 * in ieee80211_report_used_skb().
+			 */
+			ieee80211_sta_update_pending_airtime(local, sta,
+							     skb_get_queue_mapping(skb),
+							     tx_time_est,
+							     true);
+			ieee80211_info_set_tx_time_est(info, 0);
+		}
+
 		if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
 			if (info->flags & IEEE80211_TX_STAT_ACK) {
 				if (sta->status_stats.lost_packets)
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index c7b9b024d0f0..b696b9136f4c 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -3554,6 +3554,9 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
 
 	WARN_ON_ONCE(softirq_count() == 0);
 
+	if (!ieee80211_txq_airtime_check(hw, txq))
+		return NULL;
+
 begin:
 	spin_lock_bh(&fq->lock);
 
@@ -3664,6 +3667,21 @@ begin:
 	}
 
 	IEEE80211_SKB_CB(skb)->control.vif = vif;
+
+	if (local->airtime_flags & AIRTIME_USE_AQL) {
+		u32 airtime;
+
+		airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta,
+							     skb->len);
+		if (airtime) {
+			airtime = ieee80211_info_set_tx_time_est(info, airtime);
+			ieee80211_sta_update_pending_airtime(local, tx.sta,
+							     txq->ac,
+							     airtime,
+							     false);
+		}
+	}
+
 	return skb;
 
 out:
-- 
cgit v1.2.3-59-g8ed1b


From d1746d1e80a86ca86b0c2680510898d411d2ef47 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Fri, 22 Nov 2019 15:47:21 +0000
Subject: net: flow_dissector: Wrap unionized VLAN fields in a struct

In commit a82055af5959 ("netfilter: nft_payload: add VLAN offload
support"), VLAN fields in struct flow_dissector_key_vlan were unionized
with the intention of introducing another field that covered the whole TCI
header. However without a wrapping struct the subfields end up sharing the
same bits. As a result, "tc filter add ... flower vlan_id 14" specifies not
only vlan_id, but also vlan_priority.

Fix by wrapping the individual VLAN fields in a struct.

Fixes: a82055af5959 ("netfilter: nft_payload: add VLAN offload support")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index f06b0239c32b..b8c20e9f343e 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -49,9 +49,11 @@ struct flow_dissector_key_tags {
 
 struct flow_dissector_key_vlan {
 	union {
-		u16	vlan_id:12,
-			vlan_dei:1,
-			vlan_priority:3;
+		struct {
+			u16	vlan_id:12,
+				vlan_dei:1,
+				vlan_priority:3;
+		};
 		__be16	vlan_tci;
 	};
 	__be16	vlan_tpid;
-- 
cgit v1.2.3-59-g8ed1b


From a18fab48dbacbb7ff104a13e987778b7995bec07 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@mellanox.com>
Date: Mon, 28 Oct 2019 16:58:53 +0200
Subject: net/mlx5: DR, Add HW bits and definitions for Geneve flex parser

Add definition for flex parser tunneling header for Geneve.

Signed-off-by: Yevgeny Kliteynik <kliteyn@mellanox.com>
Reviewed-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr.h  | 13 +++++++++++++
 include/linux/mlx5/mlx5_ifc.h                               |  1 +
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr.h
index 6d78b027fe56..1722f4668269 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5_ifc_dr.h
@@ -559,6 +559,19 @@ struct mlx5_ifc_ste_flex_parser_tnl_vxlan_gpe_bits {
 	u8         reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_ste_flex_parser_tnl_geneve_bits {
+	u8         reserved_at_0[0x2];
+	u8         geneve_opt_len[0x6];
+	u8         geneve_oam[0x1];
+	u8         reserved_at_9[0x7];
+	u8         geneve_protocol_type[0x10];
+
+	u8         geneve_vni[0x18];
+	u8         reserved_at_38[0x8];
+
+	u8         reserved_at_40[0x40];
+};
+
 struct mlx5_ifc_ste_general_purpose_bits {
 	u8         general_purpose_lookup_field[0x20];
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 4f912d4e67bc..5d54fccf87fc 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1110,6 +1110,7 @@ enum {
 };
 
 enum {
+	MLX5_FLEX_PARSER_GENEVE_ENABLED		= 1 << 3,
 	MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED	= 1 << 7,
 	MLX5_FLEX_PARSER_ICMP_V4_ENABLED	= 1 << 8,
 	MLX5_FLEX_PARSER_ICMP_V6_ENABLED	= 1 << 9,
-- 
cgit v1.2.3-59-g8ed1b


From 30429fba99b51836ea8a11174be95ddaa8c47703 Mon Sep 17 00:00:00 2001
From: Maciej Żenczykowski <maze@google.com>
Date: Fri, 22 Nov 2019 13:50:52 -0800
Subject: net: inet_is_local_reserved_port() should return bool not int
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/ip.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index a2c61c36dc4a..cebf3e10def1 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -339,10 +339,10 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o
 void inet_get_local_port_range(struct net *net, int *low, int *high);
 
 #ifdef CONFIG_SYSCTL
-static inline int inet_is_local_reserved_port(struct net *net, int port)
+static inline bool inet_is_local_reserved_port(struct net *net, int port)
 {
 	if (!net->ipv4.sysctl_local_reserved_ports)
-		return 0;
+		return false;
 	return test_bit(port, net->ipv4.sysctl_local_reserved_ports);
 }
 
@@ -357,9 +357,9 @@ static inline int inet_prot_sock(struct net *net)
 }
 
 #else
-static inline int inet_is_local_reserved_port(struct net *net, int port)
+static inline bool inet_is_local_reserved_port(struct net *net, int port)
 {
-	return 0;
+	return false;
 }
 
 static inline int inet_prot_sock(struct net *net)
-- 
cgit v1.2.3-59-g8ed1b


From 84bb46cd62283cc371769ec1f77ff7924099f584 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Sat, 23 Nov 2019 09:54:58 -0800
Subject: Revert "bpf: Emit audit messages upon successful prog load and
 unload"

This commit reverts commit 91e6015b082b ("bpf: Emit audit messages
upon successful prog load and unload") and its follow up commit
7599a896f2e4 ("audit: Move audit_log_task declaration under
CONFIG_AUDITSYSCALL") as requested by Paul Moore. The change needs
close review on linux-audit, tests etc.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/linux/audit.h      |  5 -----
 include/uapi/linux/audit.h |  1 -
 kernel/auditsc.c           |  2 +-
 kernel/bpf/syscall.c       | 31 -------------------------------
 4 files changed, 1 insertion(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 18925d924c73..aee3dc9eb378 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -358,8 +358,6 @@ static inline void audit_ptrace(struct task_struct *t)
 		__audit_ptrace(t);
 }
 
-extern void audit_log_task(struct audit_buffer *ab);
-
 				/* Private API (for audit.c only) */
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
@@ -647,9 +645,6 @@ static inline void audit_ntp_log(const struct audit_ntp_data *ad)
 
 static inline void audit_ptrace(struct task_struct *t)
 { }
-
-static inline void audit_log_task(struct audit_buffer *ab)
-{ }
 #define audit_n_rules 0
 #define audit_signals 0
 #endif /* CONFIG_AUDITSYSCALL */
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 32a5db900f47..c89c6495983d 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -116,7 +116,6 @@
 #define AUDIT_FANOTIFY		1331	/* Fanotify access decision */
 #define AUDIT_TIME_INJOFFSET	1332	/* Timekeeping offset injected */
 #define AUDIT_TIME_ADJNTPVAL	1333	/* NTP value adjustment */
-#define AUDIT_BPF		1334	/* BPF subsystem */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9bf1045fedfa..4effe01ebbe2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2545,7 +2545,7 @@ void __audit_ntp_log(const struct audit_ntp_data *ad)
 	audit_log_ntp_val(ad, "adjust",	AUDIT_NTP_ADJUST);
 }
 
-void audit_log_task(struct audit_buffer *ab)
+static void audit_log_task(struct audit_buffer *ab)
 {
 	kuid_t auid, uid;
 	kgid_t gid;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b51ecb9644d0..4ae52eb05f41 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,7 +23,6 @@
 #include <linux/timekeeping.h>
 #include <linux/ctype.h>
 #include <linux/nospec.h>
-#include <linux/audit.h>
 #include <uapi/linux/btf.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
@@ -1322,34 +1321,6 @@ static void free_used_maps(struct bpf_prog_aux *aux)
 	kfree(aux->used_maps);
 }
 
-enum bpf_event {
-	BPF_EVENT_LOAD,
-	BPF_EVENT_UNLOAD,
-};
-
-static const char * const bpf_event_audit_str[] = {
-	[BPF_EVENT_LOAD]   = "LOAD",
-	[BPF_EVENT_UNLOAD] = "UNLOAD",
-};
-
-static void bpf_audit_prog(const struct bpf_prog *prog, enum bpf_event event)
-{
-	bool has_task_context = event == BPF_EVENT_LOAD;
-	struct audit_buffer *ab;
-
-	if (audit_enabled == AUDIT_OFF)
-		return;
-	ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_BPF);
-	if (unlikely(!ab))
-		return;
-	if (has_task_context)
-		audit_log_task(ab);
-	audit_log_format(ab, "%sprog-id=%u event=%s",
-			 has_task_context ? " " : "",
-			 prog->aux->id, bpf_event_audit_str[event]);
-	audit_log_end(ab);
-}
-
 int __bpf_prog_charge(struct user_struct *user, u32 pages)
 {
 	unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -1466,7 +1437,6 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 {
 	if (atomic64_dec_and_test(&prog->aux->refcnt)) {
 		perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
-		bpf_audit_prog(prog, BPF_EVENT_UNLOAD);
 		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog, do_idr_lock);
 		__bpf_prog_put_noref(prog, true);
@@ -1876,7 +1846,6 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	 */
 	bpf_prog_kallsyms_add(prog);
 	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
-	bpf_audit_prog(prog, BPF_EVENT_LOAD);
 
 	err = bpf_prog_new_fd(prog);
 	if (err < 0)
-- 
cgit v1.2.3-59-g8ed1b


From e3cf8b3668a808c1d252269ffc34a5723cfb9a7b Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 22 Nov 2019 12:37:08 +0000
Subject: net: phy: remove phy_ethtool_sset()

There are no users of phy_ethtool_sset() in the kernel anymore, and
as of commit 3c1bcc8614db ("net: ethernet: Convert phydev advertize
and supported from u32 to link mode"), the implementation is slightly
buggy - it doesn't correctly check the masked advertising mask as it
used to.

Remove it, and update the phy documentation to refer to its replacement
function.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 Documentation/networking/phy.rst |  3 +-
 drivers/net/phy/phy.c            | 60 ----------------------------------------
 include/linux/phy.h              |  1 -
 3 files changed, 2 insertions(+), 62 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst
index a689966bc4be..cda1c0a0492a 100644
--- a/Documentation/networking/phy.rst
+++ b/Documentation/networking/phy.rst
@@ -352,7 +352,8 @@ Fills the phydev structure with up-to-date information about the current
 settings in the PHY.
 ::
 
- int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd);
+ int phy_ethtool_ksettings_set(struct phy_device *phydev,
+                               const struct ethtool_link_ksettings *cmd);
 
 Ethtool convenience functions.
 ::
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 36d4ffe1cd3f..80be4d691e5b 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -253,66 +253,6 @@ static void phy_sanitize_settings(struct phy_device *phydev)
 	}
 }
 
-/**
- * phy_ethtool_sset - generic ethtool sset function, handles all the details
- * @phydev: target phy_device struct
- * @cmd: ethtool_cmd
- *
- * A few notes about parameter checking:
- *
- * - We don't set port or transceiver, so we don't care what they
- *   were set to.
- * - phy_start_aneg() will make sure forced settings are sane, and
- *   choose the next best ones from the ones selected, so we don't
- *   care if ethtool tries to give us bad values.
- */
-int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd)
-{
-	__ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
-	u32 speed = ethtool_cmd_speed(cmd);
-
-	if (cmd->phy_address != phydev->mdio.addr)
-		return -EINVAL;
-
-	/* We make sure that we don't pass unsupported values in to the PHY */
-	ethtool_convert_legacy_u32_to_link_mode(advertising, cmd->advertising);
-	linkmode_and(advertising, advertising, phydev->supported);
-
-	/* Verify the settings we care about. */
-	if (cmd->autoneg != AUTONEG_ENABLE && cmd->autoneg != AUTONEG_DISABLE)
-		return -EINVAL;
-
-	if (cmd->autoneg == AUTONEG_ENABLE && cmd->advertising == 0)
-		return -EINVAL;
-
-	if (cmd->autoneg == AUTONEG_DISABLE &&
-	    ((speed != SPEED_1000 &&
-	      speed != SPEED_100 &&
-	      speed != SPEED_10) ||
-	     (cmd->duplex != DUPLEX_HALF &&
-	      cmd->duplex != DUPLEX_FULL)))
-		return -EINVAL;
-
-	phydev->autoneg = cmd->autoneg;
-
-	phydev->speed = speed;
-
-	linkmode_copy(phydev->advertising, advertising);
-
-	linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
-			 phydev->advertising, AUTONEG_ENABLE == cmd->autoneg);
-
-	phydev->duplex = cmd->duplex;
-
-	phydev->mdix_ctrl = cmd->eth_tp_mdix_ctrl;
-
-	/* Restart the PHY */
-	phy_start_aneg(phydev);
-
-	return 0;
-}
-EXPORT_SYMBOL(phy_ethtool_sset);
-
 int phy_ethtool_ksettings_set(struct phy_device *phydev,
 			      const struct ethtool_link_ksettings *cmd)
 {
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 124516fe2763..f5cdfb206097 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1160,7 +1160,6 @@ void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies);
 void phy_mac_interrupt(struct phy_device *phydev);
 void phy_start_machine(struct phy_device *phydev);
 void phy_stop_machine(struct phy_device *phydev);
-int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd);
 void phy_ethtool_ksettings_get(struct phy_device *phydev,
 			       struct ethtool_link_ksettings *cmd);
 int phy_ethtool_ksettings_set(struct phy_device *phydev,
-- 
cgit v1.2.3-59-g8ed1b


From d46b7e4fb06037a61415f5b6964fcf632ee1dc34 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Thu, 21 Nov 2019 00:36:22 +0000
Subject: net: phylink: rename mac_link_state() op to mac_pcs_get_state()

Rename the mac_link_state() method to mac_pcs_get_state() to make it
clear that it should be returning the MACs PCS current state, which
is used for inband negotiation rather than just reading back what the
MAC has been configured for. Update the documentation to explicitly
mention that this is for inband.

We drop the return value as well; most of phylink doesn't check the
return value and it is not clear what it should do on error - instead
arrange for state->link to be false.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/cadence/macb_main.c          |  8 ++++----
 drivers/net/ethernet/marvell/mvneta.c             |  8 +++-----
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c   | 21 +++++++++----------
 drivers/net/ethernet/mediatek/mtk_eth_soc.c       |  8 +++-----
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |  8 ++++----
 drivers/net/ethernet/xilinx/xilinx_axienet_main.c |  8 +++-----
 drivers/net/phy/phylink.c                         | 15 ++++++--------
 include/linux/phylink.h                           | 25 ++++++++++++-----------
 net/dsa/dsa_priv.h                                |  4 ++--
 net/dsa/port.c                                    | 19 +++++++++--------
 10 files changed, 59 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 8fc2e21f0bb1..d5ae2e1e0b0e 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -505,10 +505,10 @@ static void macb_validate(struct phylink_config *config,
 		   __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
-static int macb_mac_link_state(struct phylink_config *config,
-			       struct phylink_link_state *state)
+static void macb_mac_pcs_get_state(struct phylink_config *config,
+				   struct phylink_link_state *state)
 {
-	return -EOPNOTSUPP;
+	state->link = 0;
 }
 
 static void macb_mac_an_restart(struct phylink_config *config)
@@ -604,7 +604,7 @@ static void macb_mac_link_up(struct phylink_config *config, unsigned int mode,
 
 static const struct phylink_mac_ops macb_phylink_ops = {
 	.validate = macb_validate,
-	.mac_link_state = macb_mac_link_state,
+	.mac_pcs_get_state = macb_mac_pcs_get_state,
 	.mac_an_restart = macb_mac_an_restart,
 	.mac_config = macb_mac_config,
 	.mac_link_down = macb_mac_link_down,
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index a06d109c9e80..71a872d46bc4 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -3694,8 +3694,8 @@ static void mvneta_validate(struct phylink_config *config,
 	phylink_helper_basex_speed(state);
 }
 
-static int mvneta_mac_link_state(struct phylink_config *config,
-				 struct phylink_link_state *state)
+static void mvneta_mac_pcs_get_state(struct phylink_config *config,
+				     struct phylink_link_state *state)
 {
 	struct net_device *ndev = to_net_dev(config->dev);
 	struct mvneta_port *pp = netdev_priv(ndev);
@@ -3721,8 +3721,6 @@ static int mvneta_mac_link_state(struct phylink_config *config,
 		state->pause |= MLO_PAUSE_RX;
 	if (gmac_stat & MVNETA_GMAC_TX_FLOW_CTRL_ENABLE)
 		state->pause |= MLO_PAUSE_TX;
-
-	return 1;
 }
 
 static void mvneta_mac_an_restart(struct phylink_config *config)
@@ -3915,7 +3913,7 @@ static void mvneta_mac_link_up(struct phylink_config *config, unsigned int mode,
 
 static const struct phylink_mac_ops mvneta_phylink_ops = {
 	.validate = mvneta_validate,
-	.mac_link_state = mvneta_mac_link_state,
+	.mac_pcs_get_state = mvneta_mac_pcs_get_state,
 	.mac_an_restart = mvneta_mac_an_restart,
 	.mac_config = mvneta_mac_config,
 	.mac_link_down = mvneta_mac_link_down,
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 17e24c1e1c2b..62dc2f362a16 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -4823,8 +4823,8 @@ empty_set:
 	bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
-static void mvpp22_xlg_link_state(struct mvpp2_port *port,
-				  struct phylink_link_state *state)
+static void mvpp22_xlg_pcs_get_state(struct mvpp2_port *port,
+				     struct phylink_link_state *state)
 {
 	u32 val;
 
@@ -4843,8 +4843,8 @@ static void mvpp22_xlg_link_state(struct mvpp2_port *port,
 		state->pause |= MLO_PAUSE_RX;
 }
 
-static void mvpp2_gmac_link_state(struct mvpp2_port *port,
-				  struct phylink_link_state *state)
+static void mvpp2_gmac_pcs_get_state(struct mvpp2_port *port,
+				     struct phylink_link_state *state)
 {
 	u32 val;
 
@@ -4877,8 +4877,8 @@ static void mvpp2_gmac_link_state(struct mvpp2_port *port,
 		state->pause |= MLO_PAUSE_TX;
 }
 
-static int mvpp2_phylink_mac_link_state(struct phylink_config *config,
-					struct phylink_link_state *state)
+static void mvpp2_phylink_mac_pcs_get_state(struct phylink_config *config,
+					    struct phylink_link_state *state)
 {
 	struct mvpp2_port *port = container_of(config, struct mvpp2_port,
 					       phylink_config);
@@ -4888,13 +4888,12 @@ static int mvpp2_phylink_mac_link_state(struct phylink_config *config,
 		mode &= MVPP22_XLG_CTRL3_MACMODESELECT_MASK;
 
 		if (mode == MVPP22_XLG_CTRL3_MACMODESELECT_10G) {
-			mvpp22_xlg_link_state(port, state);
-			return 1;
+			mvpp22_xlg_pcs_get_state(port, state);
+			return;
 		}
 	}
 
-	mvpp2_gmac_link_state(port, state);
-	return 1;
+	mvpp2_gmac_pcs_get_state(port, state);
 }
 
 static void mvpp2_mac_an_restart(struct phylink_config *config)
@@ -5186,7 +5185,7 @@ static void mvpp2_mac_link_down(struct phylink_config *config,
 
 static const struct phylink_mac_ops mvpp2_phylink_ops = {
 	.validate = mvpp2_phylink_validate,
-	.mac_link_state = mvpp2_phylink_mac_link_state,
+	.mac_pcs_get_state = mvpp2_phylink_mac_pcs_get_state,
 	.mac_an_restart = mvpp2_mac_an_restart,
 	.mac_config = mvpp2_mac_config,
 	.mac_link_up = mvpp2_mac_link_up,
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 1923ba76a1ec..527ad2aadcca 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -361,8 +361,8 @@ init_err:
 		mac->id, phy_modes(state->interface), err);
 }
 
-static int mtk_mac_link_state(struct phylink_config *config,
-			      struct phylink_link_state *state)
+static void mtk_mac_pcs_get_state(struct phylink_config *config,
+				  struct phylink_link_state *state)
 {
 	struct mtk_mac *mac = container_of(config, struct mtk_mac,
 					   phylink_config);
@@ -391,8 +391,6 @@ static int mtk_mac_link_state(struct phylink_config *config,
 		state->pause |= MLO_PAUSE_RX;
 	if (pmsr & MAC_MSR_TX_FC)
 		state->pause |= MLO_PAUSE_TX;
-
-	return 1;
 }
 
 static void mtk_mac_an_restart(struct phylink_config *config)
@@ -514,7 +512,7 @@ static void mtk_validate(struct phylink_config *config,
 
 static const struct phylink_mac_ops mtk_phylink_ops = {
 	.validate = mtk_validate,
-	.mac_link_state = mtk_mac_link_state,
+	.mac_pcs_get_state = mtk_mac_pcs_get_state,
 	.mac_an_restart = mtk_mac_an_restart,
 	.mac_config = mtk_mac_config,
 	.mac_link_down = mtk_mac_link_down,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 8cc4cd0cc515..644cb5d1fd4f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -868,10 +868,10 @@ static void stmmac_validate(struct phylink_config *config,
 		      __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
-static int stmmac_mac_link_state(struct phylink_config *config,
-				 struct phylink_link_state *state)
+static void stmmac_mac_pcs_get_state(struct phylink_config *config,
+				     struct phylink_link_state *state)
 {
-	return -EOPNOTSUPP;
+	state->link = 0;
 }
 
 static void stmmac_mac_config(struct phylink_config *config, unsigned int mode,
@@ -965,7 +965,7 @@ static void stmmac_mac_link_up(struct phylink_config *config,
 
 static const struct phylink_mac_ops stmmac_phylink_mac_ops = {
 	.validate = stmmac_validate,
-	.mac_link_state = stmmac_mac_link_state,
+	.mac_pcs_get_state = stmmac_mac_pcs_get_state,
 	.mac_config = stmmac_mac_config,
 	.mac_an_restart = stmmac_mac_an_restart,
 	.mac_link_down = stmmac_mac_link_down,
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index 8f32db6d2c45..20746b801959 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -1405,8 +1405,8 @@ static void axienet_validate(struct phylink_config *config,
 		   __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
-static int axienet_mac_link_state(struct phylink_config *config,
-				  struct phylink_link_state *state)
+static void axienet_mac_pcs_get_state(struct phylink_config *config,
+				      struct phylink_link_state *state)
 {
 	struct net_device *ndev = to_net_dev(config->dev);
 	struct axienet_local *lp = netdev_priv(ndev);
@@ -1431,8 +1431,6 @@ static int axienet_mac_link_state(struct phylink_config *config,
 
 	state->an_complete = 0;
 	state->duplex = 1;
-
-	return 1;
 }
 
 static void axienet_mac_an_restart(struct phylink_config *config)
@@ -1497,7 +1495,7 @@ static void axienet_mac_link_up(struct phylink_config *config,
 
 static const struct phylink_mac_ops axienet_phylink_ops = {
 	.validate = axienet_validate,
-	.mac_link_state = axienet_mac_link_state,
+	.mac_pcs_get_state = axienet_mac_pcs_get_state,
 	.mac_an_restart = axienet_mac_an_restart,
 	.mac_config = axienet_mac_config,
 	.mac_link_down = axienet_mac_link_down,
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 8e2a12885789..9a616d6bc4eb 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -357,9 +357,9 @@ static void phylink_mac_an_restart(struct phylink *pl)
 		pl->ops->mac_an_restart(pl->config);
 }
 
-static int phylink_get_mac_state(struct phylink *pl, struct phylink_link_state *state)
+static void phylink_mac_pcs_get_state(struct phylink *pl,
+				      struct phylink_link_state *state)
 {
-
 	linkmode_copy(state->advertising, pl->link_config.advertising);
 	linkmode_zero(state->lp_advertising);
 	state->interface = pl->link_config.interface;
@@ -370,7 +370,7 @@ static int phylink_get_mac_state(struct phylink *pl, struct phylink_link_state *
 	state->an_complete = 0;
 	state->link = 1;
 
-	return pl->ops->mac_link_state(pl->config, state);
+	pl->ops->mac_pcs_get_state(pl->config, state);
 }
 
 /* The fixed state is... fixed except for the link state,
@@ -493,7 +493,7 @@ static void phylink_resolve(struct work_struct *w)
 			break;
 
 		case MLO_AN_INBAND:
-			phylink_get_mac_state(pl, &link_state);
+			phylink_mac_pcs_get_state(pl, &link_state);
 
 			/* If we have a phy, the "up" state is the union of
 			 * both the PHY and the MAC */
@@ -1142,7 +1142,7 @@ int phylink_ethtool_ksettings_get(struct phylink *pl,
 		if (pl->phydev)
 			break;
 
-		phylink_get_mac_state(pl, &link_state);
+		phylink_mac_pcs_get_state(pl, &link_state);
 
 		/* The MAC is reporting the link results from its own PCS
 		 * layer via in-band status. Report these as the current
@@ -1561,10 +1561,7 @@ static int phylink_mii_read(struct phylink *pl, unsigned int phy_id,
 
 	case MLO_AN_INBAND:
 		if (phy_id == 0) {
-			val = phylink_get_mac_state(pl, &state);
-			if (val < 0)
-				return val;
-
+			phylink_mac_pcs_get_state(pl, &state);
 			val = phylink_mii_emul_read(reg, &state);
 		}
 		break;
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 300ecdb6790a..fed5488e3c75 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -72,7 +72,7 @@ struct phylink_config {
 /**
  * struct phylink_mac_ops - MAC operations structure.
  * @validate: Validate and update the link configuration.
- * @mac_link_state: Read the current link state from the hardware.
+ * @mac_pcs_get_state: Read the current link state from the hardware.
  * @mac_config: configure the MAC for the selected mode and state.
  * @mac_an_restart: restart 802.3z BaseX autonegotiation.
  * @mac_link_down: take the link down.
@@ -84,8 +84,8 @@ struct phylink_mac_ops {
 	void (*validate)(struct phylink_config *config,
 			 unsigned long *supported,
 			 struct phylink_link_state *state);
-	int (*mac_link_state)(struct phylink_config *config,
-			      struct phylink_link_state *state);
+	void (*mac_pcs_get_state)(struct phylink_config *config,
+				  struct phylink_link_state *state);
 	void (*mac_config)(struct phylink_config *config, unsigned int mode,
 			   const struct phylink_link_state *state);
 	void (*mac_an_restart)(struct phylink_config *config);
@@ -127,18 +127,19 @@ void validate(struct phylink_config *config, unsigned long *supported,
 	      struct phylink_link_state *state);
 
 /**
- * mac_link_state() - Read the current link state from the hardware
+ * mac_pcs_get_state() - Read the current inband link state from the hardware
  * @config: a pointer to a &struct phylink_config.
  * @state: a pointer to a &struct phylink_link_state.
  *
- * Read the current link state from the MAC, reporting the current
- * speed in @state->speed, duplex mode in @state->duplex, pause mode
- * in @state->pause using the %MLO_PAUSE_RX and %MLO_PAUSE_TX bits,
- * negotiation completion state in @state->an_complete, and link
- * up state in @state->link.
+ * Read the current inband link state from the MAC PCS, reporting the
+ * current speed in @state->speed, duplex mode in @state->duplex, pause
+ * mode in @state->pause using the %MLO_PAUSE_RX and %MLO_PAUSE_TX bits,
+ * negotiation completion state in @state->an_complete, and link up state
+ * in @state->link. If possible, @state->lp_advertising should also be
+ * populated.
  */
-int mac_link_state(struct phylink_config *config,
-		   struct phylink_link_state *state);
+void mac_pcs_get_state(struct phylink_config *config,
+		       struct phylink_link_state *state);
 
 /**
  * mac_config() - configure the MAC for the selected mode and state
@@ -166,7 +167,7 @@ int mac_link_state(struct phylink_config *config,
  *   1000base-X or Cisco SGMII mode depending on the @state->interface
  *   mode). In both cases, link state management (whether the link
  *   is up or not) is performed by the MAC, and reported via the
- *   mac_link_state() callback. Changes in link state must be made
+ *   mac_pcs_get_state() callback. Changes in link state must be made
  *   by calling phylink_mac_change().
  *
  *   If in 802.3z mode, the link speed is fixed, dependent on the
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 53e7577896b6..2dd86d9bcda9 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -153,8 +153,8 @@ void dsa_port_link_unregister_of(struct dsa_port *dp);
 void dsa_port_phylink_validate(struct phylink_config *config,
 			       unsigned long *supported,
 			       struct phylink_link_state *state);
-int dsa_port_phylink_mac_link_state(struct phylink_config *config,
-				    struct phylink_link_state *state);
+void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config,
+					struct phylink_link_state *state);
 void dsa_port_phylink_mac_config(struct phylink_config *config,
 				 unsigned int mode,
 				 const struct phylink_link_state *state);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 6e93c36bf0c0..46ac9ba21987 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -429,19 +429,22 @@ void dsa_port_phylink_validate(struct phylink_config *config,
 }
 EXPORT_SYMBOL_GPL(dsa_port_phylink_validate);
 
-int dsa_port_phylink_mac_link_state(struct phylink_config *config,
-				    struct phylink_link_state *state)
+void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config,
+					struct phylink_link_state *state)
 {
 	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
 	struct dsa_switch *ds = dp->ds;
 
-	/* Only called for SGMII and 802.3z */
-	if (!ds->ops->phylink_mac_link_state)
-		return -EOPNOTSUPP;
+	/* Only called for inband modes */
+	if (!ds->ops->phylink_mac_link_state) {
+		state->link = 0;
+		return;
+	}
 
-	return ds->ops->phylink_mac_link_state(ds, dp->index, state);
+	if (ds->ops->phylink_mac_link_state(ds, dp->index, state) < 0)
+		state->link = 0;
 }
-EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_state);
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_pcs_get_state);
 
 void dsa_port_phylink_mac_config(struct phylink_config *config,
 				 unsigned int mode,
@@ -510,7 +513,7 @@ EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_link_up);
 
 const struct phylink_mac_ops dsa_port_phylink_mac_ops = {
 	.validate = dsa_port_phylink_validate,
-	.mac_link_state = dsa_port_phylink_mac_link_state,
+	.mac_pcs_get_state = dsa_port_phylink_mac_pcs_get_state,
 	.mac_config = dsa_port_phylink_mac_config,
 	.mac_an_restart = dsa_port_phylink_mac_an_restart,
 	.mac_link_down = dsa_port_phylink_mac_link_down,
-- 
cgit v1.2.3-59-g8ed1b


From 312434617cb16be5166316cf9d08ba760b1042a1 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 23 Nov 2019 11:56:49 +0800
Subject: sctp: cache netns in sctp_ep_common

This patch is to fix a data-race reported by syzbot:

  BUG: KCSAN: data-race in sctp_assoc_migrate / sctp_hash_obj

  write to 0xffff8880b67c0020 of 8 bytes by task 18908 on cpu 1:
    sctp_assoc_migrate+0x1a6/0x290 net/sctp/associola.c:1091
    sctp_sock_migrate+0x8aa/0x9b0 net/sctp/socket.c:9465
    sctp_accept+0x3c8/0x470 net/sctp/socket.c:4916
    inet_accept+0x7f/0x360 net/ipv4/af_inet.c:734
    __sys_accept4+0x224/0x430 net/socket.c:1754
    __do_sys_accept net/socket.c:1795 [inline]
    __se_sys_accept net/socket.c:1792 [inline]
    __x64_sys_accept+0x4e/0x60 net/socket.c:1792
    do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
    entry_SYSCALL_64_after_hwframe+0x44/0xa9

  read to 0xffff8880b67c0020 of 8 bytes by task 12003 on cpu 0:
    sctp_hash_obj+0x4f/0x2d0 net/sctp/input.c:894
    rht_key_get_hash include/linux/rhashtable.h:133 [inline]
    rht_key_hashfn include/linux/rhashtable.h:159 [inline]
    rht_head_hashfn include/linux/rhashtable.h:174 [inline]
    head_hashfn lib/rhashtable.c:41 [inline]
    rhashtable_rehash_one lib/rhashtable.c:245 [inline]
    rhashtable_rehash_chain lib/rhashtable.c:276 [inline]
    rhashtable_rehash_table lib/rhashtable.c:316 [inline]
    rht_deferred_worker+0x468/0xab0 lib/rhashtable.c:420
    process_one_work+0x3d4/0x890 kernel/workqueue.c:2269
    worker_thread+0xa0/0x800 kernel/workqueue.c:2415
    kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253
    ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352

It was caused by rhashtable access asoc->base.sk when sctp_assoc_migrate
is changing its value. However, what rhashtable wants is netns from asoc
base.sk, and for an asoc, its netns won't change once set. So we can
simply fix it by caching netns since created.

Fixes: d6c0256a60e6 ("sctp: add the rhashtable apis for sctp global transport hashtable")
Reported-by: syzbot+e3b35fe7918ff0ee474e@syzkaller.appspotmail.com
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/sctp/structs.h | 3 +++
 net/sctp/associola.c       | 1 +
 net/sctp/endpointola.c     | 1 +
 net/sctp/input.c           | 4 ++--
 4 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 503fbc3cd819..2b6f3f13d5bc 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1239,6 +1239,9 @@ struct sctp_ep_common {
 	/* What socket does this endpoint belong to?  */
 	struct sock *sk;
 
+	/* Cache netns and it won't change once set */
+	struct net *net;
+
 	/* This is where we receive inbound chunks.  */
 	struct sctp_inq	  inqueue;
 
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index d2ffc9a0ba3a..41839b85c268 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -64,6 +64,7 @@ static struct sctp_association *sctp_association_init(
 	/* Discarding const is appropriate here.  */
 	asoc->ep = (struct sctp_endpoint *)ep;
 	asoc->base.sk = (struct sock *)sk;
+	asoc->base.net = sock_net(sk);
 
 	sctp_endpoint_hold(asoc->ep);
 	sock_hold(asoc->base.sk);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index ea53049d1db6..3067deb0fbec 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -110,6 +110,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
 
 	/* Remember who we are attached to.  */
 	ep->base.sk = sk;
+	ep->base.net = sock_net(sk);
 	sock_hold(ep->base.sk);
 
 	return ep;
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 2277981559d0..4d2bcfc9d7f8 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -882,7 +882,7 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
 	if (!sctp_transport_hold(t))
 		return err;
 
-	if (!net_eq(sock_net(t->asoc->base.sk), x->net))
+	if (!net_eq(t->asoc->base.net, x->net))
 		goto out;
 	if (x->lport != htons(t->asoc->base.bind_addr.port))
 		goto out;
@@ -897,7 +897,7 @@ static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
 {
 	const struct sctp_transport *t = data;
 
-	return sctp_hashfn(sock_net(t->asoc->base.sk),
+	return sctp_hashfn(t->asoc->base.net,
 			   htons(t->asoc->base.bind_addr.port),
 			   &t->ipaddr, seed);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 4b3da77b72ad6b3c48c6fe4a395ace7db39a12c5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 22 Nov 2019 21:07:54 +0100
Subject: bpf, x86: Generalize and extend bpf_arch_text_poke for direct jumps

Add BPF_MOD_{NOP_TO_JUMP,JUMP_TO_JUMP,JUMP_TO_NOP} patching for x86
JIT in order to be able to patch direct jumps or nop them out. We need
this facility in order to patch tail call jumps and in later work also
BPF static keys.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/aa4784196a8e5e985af4b30a4fe5336bce6e9643.1574452833.git.daniel@iogearbox.net
---
 arch/x86/net/bpf_jit_comp.c | 64 ++++++++++++++++++++++++++++++++-------------
 include/linux/bpf.h         |  6 +++++
 2 files changed, 52 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 2e586f579945..f438bd3b7689 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -203,8 +203,9 @@ struct jit_context {
 /* Maximum number of bytes emitted while JITing one eBPF insn */
 #define BPF_MAX_INSN_SIZE	128
 #define BPF_INSN_SAFETY		64
-/* number of bytes emit_call() needs to generate call instruction */
-#define X86_CALL_SIZE		5
+
+/* Number of bytes emit_patch() needs to generate instructions */
+#define X86_PATCH_SIZE		5
 
 #define PROLOGUE_SIZE		25
 
@@ -215,7 +216,7 @@ struct jit_context {
 static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
 {
 	u8 *prog = *pprog;
-	int cnt = X86_CALL_SIZE;
+	int cnt = X86_PATCH_SIZE;
 
 	/* BPF trampoline can be made to work without these nops,
 	 * but let's waste 5 bytes for now and optimize later
@@ -480,64 +481,91 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
 	*pprog = prog;
 }
 
-static int emit_call(u8 **pprog, void *func, void *ip)
+static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
 {
 	u8 *prog = *pprog;
 	int cnt = 0;
 	s64 offset;
 
-	offset = func - (ip + X86_CALL_SIZE);
+	offset = func - (ip + X86_PATCH_SIZE);
 	if (!is_simm32(offset)) {
 		pr_err("Target call %p is out of range\n", func);
 		return -EINVAL;
 	}
-	EMIT1_off32(0xE8, offset);
+	EMIT1_off32(opcode, offset);
 	*pprog = prog;
 	return 0;
 }
 
+static int emit_call(u8 **pprog, void *func, void *ip)
+{
+	return emit_patch(pprog, func, ip, 0xE8);
+}
+
+static int emit_jump(u8 **pprog, void *func, void *ip)
+{
+	return emit_patch(pprog, func, ip, 0xE9);
+}
+
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		       void *old_addr, void *new_addr)
 {
-	u8 old_insn[X86_CALL_SIZE] = {};
-	u8 new_insn[X86_CALL_SIZE] = {};
+	int (*emit_patch_fn)(u8 **pprog, void *func, void *ip);
+	u8 old_insn[X86_PATCH_SIZE] = {};
+	u8 new_insn[X86_PATCH_SIZE] = {};
 	u8 *prog;
 	int ret;
 
 	if (!is_kernel_text((long)ip) &&
 	    !is_bpf_text_address((long)ip))
-		/* BPF trampoline in modules is not supported */
+		/* BPF poking in modules is not supported */
 		return -EINVAL;
 
+	switch (t) {
+	case BPF_MOD_NOP_TO_CALL ... BPF_MOD_CALL_TO_NOP:
+		emit_patch_fn = emit_call;
+		break;
+	case BPF_MOD_NOP_TO_JUMP ... BPF_MOD_JUMP_TO_NOP:
+		emit_patch_fn = emit_jump;
+		break;
+	default:
+		return -ENOTSUPP;
+	}
+
 	if (old_addr) {
 		prog = old_insn;
-		ret = emit_call(&prog, old_addr, (void *)ip);
+		ret = emit_patch_fn(&prog, old_addr, (void *)ip);
 		if (ret)
 			return ret;
 	}
 	if (new_addr) {
 		prog = new_insn;
-		ret = emit_call(&prog, new_addr, (void *)ip);
+		ret = emit_patch_fn(&prog, new_addr, (void *)ip);
 		if (ret)
 			return ret;
 	}
+
 	ret = -EBUSY;
 	mutex_lock(&text_mutex);
 	switch (t) {
 	case BPF_MOD_NOP_TO_CALL:
-		if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
+	case BPF_MOD_NOP_TO_JUMP:
+		if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE))
 			goto out;
-		text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
+		text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
 		break;
 	case BPF_MOD_CALL_TO_CALL:
-		if (memcmp(ip, old_insn, X86_CALL_SIZE))
+	case BPF_MOD_JUMP_TO_JUMP:
+		if (memcmp(ip, old_insn, X86_PATCH_SIZE))
 			goto out;
-		text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
+		text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
 		break;
 	case BPF_MOD_CALL_TO_NOP:
-		if (memcmp(ip, old_insn, X86_CALL_SIZE))
+	case BPF_MOD_JUMP_TO_NOP:
+		if (memcmp(ip, old_insn, X86_PATCH_SIZE))
 			goto out;
-		text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE, NULL);
+		text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE,
+			     NULL);
 		break;
 	}
 	ret = 0;
@@ -1394,7 +1422,7 @@ int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags
 		/* skip patched call instruction and point orig_call to actual
 		 * body of the kernel function.
 		 */
-		orig_call += X86_CALL_SIZE;
+		orig_call += X86_PATCH_SIZE;
 
 	prog = image;
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e89e86122233..7978b617caa8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1284,10 +1284,16 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
 #endif /* CONFIG_INET */
 
 enum bpf_text_poke_type {
+	/* All call-related pokes. */
 	BPF_MOD_NOP_TO_CALL,
 	BPF_MOD_CALL_TO_CALL,
 	BPF_MOD_CALL_TO_NOP,
+	/* All jump-related pokes. */
+	BPF_MOD_NOP_TO_JUMP,
+	BPF_MOD_JUMP_TO_JUMP,
+	BPF_MOD_JUMP_TO_NOP,
 };
+
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		       void *addr1, void *addr2);
 
-- 
cgit v1.2.3-59-g8ed1b


From 6332be04c039a72fca32ed0a4265bac58d606bb6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 22 Nov 2019 21:07:55 +0100
Subject: bpf: Move bpf_free_used_maps into sleepable section

We later on are going to need a sleepable context as opposed to plain
RCU callback in order to untrack programs we need to poke at runtime
and tracking as well as image update is performed under mutex.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/09823b1d5262876e9b83a8e75df04cf0467357a4.1574452833.git.daniel@iogearbox.net
---
 include/linux/bpf.h  |  4 ++++
 kernel/bpf/core.c    | 23 +++++++++++++++++++++++
 kernel/bpf/syscall.c | 20 --------------------
 3 files changed, 27 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7978b617caa8..561b920f0bf7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1031,6 +1031,10 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 {
 	return -ENOTSUPP;
 }
+
+static inline void bpf_map_put(struct bpf_map *map)
+{
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b5945c3aaa8e..0e825c164f1a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2003,12 +2003,35 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array,
 								     : 0;
 }
 
+static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
+{
+	enum bpf_cgroup_storage_type stype;
+
+	for_each_cgroup_storage_type(stype) {
+		if (!aux->cgroup_storage[stype])
+			continue;
+		bpf_cgroup_storage_release(aux->prog,
+					   aux->cgroup_storage[stype]);
+	}
+}
+
+static void bpf_free_used_maps(struct bpf_prog_aux *aux)
+{
+	int i;
+
+	bpf_free_cgroup_storage(aux);
+	for (i = 0; i < aux->used_map_cnt; i++)
+		bpf_map_put(aux->used_maps[i]);
+	kfree(aux->used_maps);
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
 	int i;
 
 	aux = container_of(work, struct bpf_prog_aux, work);
+	bpf_free_used_maps(aux);
 	if (bpf_prog_is_dev_bound(aux))
 		bpf_prog_offload_destroy(aux->prog);
 #ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4ae52eb05f41..373778da8489 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1302,25 +1302,6 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 	return 0;
 }
 
-/* drop refcnt on maps used by eBPF program and free auxilary data */
-static void free_used_maps(struct bpf_prog_aux *aux)
-{
-	enum bpf_cgroup_storage_type stype;
-	int i;
-
-	for_each_cgroup_storage_type(stype) {
-		if (!aux->cgroup_storage[stype])
-			continue;
-		bpf_cgroup_storage_release(aux->prog,
-					   aux->cgroup_storage[stype]);
-	}
-
-	for (i = 0; i < aux->used_map_cnt; i++)
-		bpf_map_put(aux->used_maps[i]);
-
-	kfree(aux->used_maps);
-}
-
 int __bpf_prog_charge(struct user_struct *user, u32 pages)
 {
 	unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -1415,7 +1396,6 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 
 	kvfree(aux->func_info);
 	kfree(aux->func_info_aux);
-	free_used_maps(aux);
 	bpf_prog_uncharge_memlock(aux->prog);
 	security_bpf_prog_free(aux);
 	bpf_prog_free(aux->prog);
-- 
cgit v1.2.3-59-g8ed1b


From 2beee5f57441413b64a9c2bd657e17beabb98d1c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 22 Nov 2019 21:07:56 +0100
Subject: bpf: Move owner type, jited info into array auxiliary data

We're going to extend this with further information which is only
relevant for prog array at this point. Given this info is not used
in critical path, move it into its own structure such that the main
array map structure can be kept on diet.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/b9ddccdb0f6f7026489ee955f16c96381e1e7238.1574452833.git.daniel@iogearbox.net
---
 include/linux/bpf.h     | 18 +++++++++++-------
 kernel/bpf/arraymap.c   | 32 ++++++++++++++++++++++++++++++--
 kernel/bpf/core.c       | 11 +++++------
 kernel/bpf/map_in_map.c |  5 ++---
 kernel/bpf/syscall.c    | 16 ++++++----------
 5 files changed, 54 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 561b920f0bf7..c3b29061284e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -560,17 +560,21 @@ struct bpf_prog_aux {
 	};
 };
 
+struct bpf_array_aux {
+	/* 'Ownership' of prog array is claimed by the first program that
+	 * is going to use this map or by the first program which FD is
+	 * stored in the map to make sure that all callers and callees have
+	 * the same prog type and JITed flag.
+	 */
+	enum bpf_prog_type type;
+	bool jited;
+};
+
 struct bpf_array {
 	struct bpf_map map;
 	u32 elem_size;
 	u32 index_mask;
-	/* 'ownership' of prog_array is claimed by the first program that
-	 * is going to use this map or by the first program which FD is stored
-	 * in the map to make sure that all callers and callees have the same
-	 * prog_type and JITed flag
-	 */
-	enum bpf_prog_type owner_prog_type;
-	bool owner_jited;
+	struct bpf_array_aux *aux;
 	union {
 		char value[0] __aligned(8);
 		void *ptrs[0] __aligned(8);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633c8c701ff6..57da950ee55b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -671,10 +671,38 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_array_aux *aux;
+	struct bpf_map *map;
+
+	aux = kzalloc(sizeof(*aux), GFP_KERNEL);
+	if (!aux)
+		return ERR_PTR(-ENOMEM);
+
+	map = array_map_alloc(attr);
+	if (IS_ERR(map)) {
+		kfree(aux);
+		return map;
+	}
+
+	container_of(map, struct bpf_array, map)->aux = aux;
+	return map;
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	kfree(aux);
+	fd_array_map_free(map);
+}
+
 const struct bpf_map_ops prog_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
-	.map_alloc = array_map_alloc,
-	.map_free = fd_array_map_free,
+	.map_alloc = prog_array_map_alloc,
+	.map_free = prog_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 0e825c164f1a..07af9c1d9cf1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1691,18 +1691,17 @@ bool bpf_prog_array_compatible(struct bpf_array *array,
 	if (fp->kprobe_override)
 		return false;
 
-	if (!array->owner_prog_type) {
+	if (!array->aux->type) {
 		/* There's no owner yet where we could check for
 		 * compatibility.
 		 */
-		array->owner_prog_type = fp->type;
-		array->owner_jited = fp->jited;
-
+		array->aux->type  = fp->type;
+		array->aux->jited = fp->jited;
 		return true;
 	}
 
-	return array->owner_prog_type == fp->type &&
-	       array->owner_jited == fp->jited;
+	return array->aux->type  == fp->type &&
+	       array->aux->jited == fp->jited;
 }
 
 static int bpf_check_tail_call(const struct bpf_prog *fp)
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 4cbe987be35b..5e9366b33f0f 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -17,9 +17,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 	if (IS_ERR(inner_map))
 		return inner_map;
 
-	/* prog_array->owner_prog_type and owner_jited
-	 * is a runtime binding.  Doing static check alone
-	 * in the verifier is not enough.
+	/* prog_array->aux->{type,jited} is a runtime binding.
+	 * Doing static check alone in the verifier is not enough.
 	 */
 	if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
 	    inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 373778da8489..b904d56ec686 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -389,13 +389,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 {
 	const struct bpf_map *map = filp->private_data;
 	const struct bpf_array *array;
-	u32 owner_prog_type = 0;
-	u32 owner_jited = 0;
+	u32 type = 0, jited = 0;
 
 	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
 		array = container_of(map, struct bpf_array, map);
-		owner_prog_type = array->owner_prog_type;
-		owner_jited = array->owner_jited;
+		type  = array->aux->type;
+		jited = array->aux->jited;
 	}
 
 	seq_printf(m,
@@ -415,12 +414,9 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   map->memory.pages * 1ULL << PAGE_SHIFT,
 		   map->id,
 		   READ_ONCE(map->frozen));
-
-	if (owner_prog_type) {
-		seq_printf(m, "owner_prog_type:\t%u\n",
-			   owner_prog_type);
-		seq_printf(m, "owner_jited:\t%u\n",
-			   owner_jited);
+	if (type) {
+		seq_printf(m, "owner_prog_type:\t%u\n", type);
+		seq_printf(m, "owner_jited:\t%u\n", jited);
 	}
 }
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From a66886fe6c24ebeeb6dc10fbd9b75158029eacf7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 22 Nov 2019 21:07:57 +0100
Subject: bpf: Add initial poke descriptor table for jit images

Add initial poke table data structures and management to the BPF
prog that can later be used by JITs. Also add an instance of poke
specific data for tail call maps; plan for later work is to extend
this also for BPF static keys.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/1db285ec2ea4207ee0455b3f8e191a4fc58b9ade.1574452833.git.daniel@iogearbox.net
---
 include/linux/bpf.h    | 20 ++++++++++++++++++++
 include/linux/filter.h | 10 ++++++++++
 kernel/bpf/core.c      | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c3b29061284e..312983bf7faa 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -488,6 +488,24 @@ struct bpf_func_info_aux {
 	bool unreliable;
 };
 
+enum bpf_jit_poke_reason {
+	BPF_POKE_REASON_TAIL_CALL,
+};
+
+/* Descriptor of pokes pointing /into/ the JITed image. */
+struct bpf_jit_poke_descriptor {
+	void *ip;
+	union {
+		struct {
+			struct bpf_map *map;
+			u32 key;
+		} tail_call;
+	};
+	bool ip_stable;
+	u8 adj_off;
+	u16 reason;
+};
+
 struct bpf_prog_aux {
 	atomic64_t refcnt;
 	u32 used_map_cnt;
@@ -513,6 +531,8 @@ struct bpf_prog_aux {
 	const char *attach_func_name;
 	struct bpf_prog **func;
 	void *jit_data; /* JIT specific data. arch dependent */
+	struct bpf_jit_poke_descriptor *poke_tab;
+	u32 size_poke_tab;
 	struct latch_tree_node ksym_tnode;
 	struct list_head ksym_lnode;
 	const struct bpf_prog_ops *ops;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index ad80e9c6111c..796b60d8cc6c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -952,6 +952,9 @@ void *bpf_jit_alloc_exec(unsigned long size);
 void bpf_jit_free_exec(void *addr);
 void bpf_jit_free(struct bpf_prog *fp);
 
+int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
+				struct bpf_jit_poke_descriptor *poke);
+
 int bpf_jit_get_func_addr(const struct bpf_prog *prog,
 			  const struct bpf_insn *insn, bool extra_pass,
 			  u64 *func_addr, bool *func_addr_fixed);
@@ -1055,6 +1058,13 @@ static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
 	return false;
 }
 
+static inline int
+bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
+			    struct bpf_jit_poke_descriptor *poke)
+{
+	return -ENOTSUPP;
+}
+
 static inline void bpf_jit_free(struct bpf_prog *fp)
 {
 	bpf_prog_unlock_free(fp);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 07af9c1d9cf1..608b7085e0c9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -256,6 +256,7 @@ void __bpf_prog_free(struct bpf_prog *fp)
 {
 	if (fp->aux) {
 		free_percpu(fp->aux->stats);
+		kfree(fp->aux->poke_tab);
 		kfree(fp->aux);
 	}
 	vfree(fp);
@@ -756,6 +757,39 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 	return ret;
 }
 
+int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
+				struct bpf_jit_poke_descriptor *poke)
+{
+	struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab;
+	static const u32 poke_tab_max = 1024;
+	u32 slot = prog->aux->size_poke_tab;
+	u32 size = slot + 1;
+
+	if (size > poke_tab_max)
+		return -ENOSPC;
+	if (poke->ip || poke->ip_stable || poke->adj_off)
+		return -EINVAL;
+
+	switch (poke->reason) {
+	case BPF_POKE_REASON_TAIL_CALL:
+		if (!poke->tail_call.map)
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	tab = krealloc(tab, size * sizeof(*poke), GFP_KERNEL);
+	if (!tab)
+		return -ENOMEM;
+
+	memcpy(&tab[slot], poke, sizeof(*poke));
+	prog->aux->size_poke_tab = size;
+	prog->aux->poke_tab = tab;
+
+	return slot;
+}
+
 static atomic_long_t bpf_jit_current;
 
 /* Can be overridden by an arch's JIT compiler if it has a custom,
-- 
cgit v1.2.3-59-g8ed1b


From da765a2f599304a81a25e77908d1790414ecdbb6 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 22 Nov 2019 21:07:58 +0100
Subject: bpf: Add poke dependency tracking for prog array maps

This work adds program tracking to prog array maps. This is needed such
that upon prog array updates/deletions we can fix up all programs which
make use of this tail call map. We add ops->map_poke_{un,}track()
helpers to maps to maintain the list of programs and ops->map_poke_run()
for triggering the actual update.

bpf_array_aux is extended to contain the list head and poke_mutex in
order to serialize program patching during updates/deletions.
bpf_free_used_maps() will untrack the program shortly before dropping
the reference to the map. For clearing out the prog array once all urefs
are dropped we need to use schedule_work() to have a sleepable context.

The prog_array_map_poke_run() is triggered during updates/deletions and
walks the maintained prog list. It checks in their poke_tabs whether the
map and key is matching and runs the actual bpf_arch_text_poke() for
patching in the nop or new jmp location. Depending on the type of update,
we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/1fb364bb3c565b3e415d5ea348f036ff379e779d.1574452833.git.daniel@iogearbox.net
---
 include/linux/bpf.h   |  12 ++++
 kernel/bpf/arraymap.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/bpf/core.c     |   9 ++-
 kernel/bpf/syscall.c  |  20 ++++--
 4 files changed, 212 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 312983bf7faa..c2f07fd410c1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -22,6 +22,7 @@ struct bpf_verifier_env;
 struct bpf_verifier_log;
 struct perf_event;
 struct bpf_prog;
+struct bpf_prog_aux;
 struct bpf_map;
 struct sock;
 struct seq_file;
@@ -64,6 +65,12 @@ struct bpf_map_ops {
 			     const struct btf_type *key_type,
 			     const struct btf_type *value_type);
 
+	/* Prog poke tracking helpers. */
+	int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
+	void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
+	void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
+			     struct bpf_prog *new);
+
 	/* Direct value access helpers. */
 	int (*map_direct_value_addr)(const struct bpf_map *map,
 				     u64 *imm, u32 off);
@@ -588,6 +595,11 @@ struct bpf_array_aux {
 	 */
 	enum bpf_prog_type type;
 	bool jited;
+	/* Programs with direct jumps into programs part of this array. */
+	struct list_head poke_progs;
+	struct bpf_map *map;
+	struct mutex poke_mutex;
+	struct work_struct work;
 };
 
 struct bpf_array {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 57da950ee55b..58bdf5fd24cc 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -586,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 	if (IS_ERR(new_ptr))
 		return PTR_ERR(new_ptr);
 
-	old_ptr = xchg(array->ptrs + index, new_ptr);
+	if (map->ops->map_poke_run) {
+		mutex_lock(&array->aux->poke_mutex);
+		old_ptr = xchg(array->ptrs + index, new_ptr);
+		map->ops->map_poke_run(map, index, old_ptr, new_ptr);
+		mutex_unlock(&array->aux->poke_mutex);
+	} else {
+		old_ptr = xchg(array->ptrs + index, new_ptr);
+	}
+
 	if (old_ptr)
 		map->ops->map_fd_put_ptr(old_ptr);
-
 	return 0;
 }
 
@@ -602,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 	if (index >= array->map.max_entries)
 		return -E2BIG;
 
-	old_ptr = xchg(array->ptrs + index, NULL);
+	if (map->ops->map_poke_run) {
+		mutex_lock(&array->aux->poke_mutex);
+		old_ptr = xchg(array->ptrs + index, NULL);
+		map->ops->map_poke_run(map, index, old_ptr, NULL);
+		mutex_unlock(&array->aux->poke_mutex);
+	} else {
+		old_ptr = xchg(array->ptrs + index, NULL);
+	}
+
 	if (old_ptr) {
 		map->ops->map_fd_put_ptr(old_ptr);
 		return 0;
@@ -671,6 +686,152 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
 	rcu_read_unlock();
 }
 
+struct prog_poke_elem {
+	struct list_head list;
+	struct bpf_prog_aux *aux;
+};
+
+static int prog_array_map_poke_track(struct bpf_map *map,
+				     struct bpf_prog_aux *prog_aux)
+{
+	struct prog_poke_elem *elem;
+	struct bpf_array_aux *aux;
+	int ret = 0;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	mutex_lock(&aux->poke_mutex);
+	list_for_each_entry(elem, &aux->poke_progs, list) {
+		if (elem->aux == prog_aux)
+			goto out;
+	}
+
+	elem = kmalloc(sizeof(*elem), GFP_KERNEL);
+	if (!elem) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&elem->list);
+	/* We must track the program's aux info at this point in time
+	 * since the program pointer itself may not be stable yet, see
+	 * also comment in prog_array_map_poke_run().
+	 */
+	elem->aux = prog_aux;
+
+	list_add_tail(&elem->list, &aux->poke_progs);
+out:
+	mutex_unlock(&aux->poke_mutex);
+	return ret;
+}
+
+static void prog_array_map_poke_untrack(struct bpf_map *map,
+					struct bpf_prog_aux *prog_aux)
+{
+	struct prog_poke_elem *elem, *tmp;
+	struct bpf_array_aux *aux;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	mutex_lock(&aux->poke_mutex);
+	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+		if (elem->aux == prog_aux) {
+			list_del_init(&elem->list);
+			kfree(elem);
+			break;
+		}
+	}
+	mutex_unlock(&aux->poke_mutex);
+}
+
+static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
+				    struct bpf_prog *old,
+				    struct bpf_prog *new)
+{
+	enum bpf_text_poke_type type;
+	struct prog_poke_elem *elem;
+	struct bpf_array_aux *aux;
+
+	if (!old && new)
+		type = BPF_MOD_NOP_TO_JUMP;
+	else if (old && !new)
+		type = BPF_MOD_JUMP_TO_NOP;
+	else if (old && new)
+		type = BPF_MOD_JUMP_TO_JUMP;
+	else
+		return;
+
+	aux = container_of(map, struct bpf_array, map)->aux;
+	WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));
+
+	list_for_each_entry(elem, &aux->poke_progs, list) {
+		struct bpf_jit_poke_descriptor *poke;
+		int i, ret;
+
+		for (i = 0; i < elem->aux->size_poke_tab; i++) {
+			poke = &elem->aux->poke_tab[i];
+
+			/* Few things to be aware of:
+			 *
+			 * 1) We can only ever access aux in this context, but
+			 *    not aux->prog since it might not be stable yet and
+			 *    there could be danger of use after free otherwise.
+			 * 2) Initially when we start tracking aux, the program
+			 *    is not JITed yet and also does not have a kallsyms
+			 *    entry. We skip these as poke->ip_stable is not
+			 *    active yet. The JIT will do the final fixup before
+			 *    setting it stable. The various poke->ip_stable are
+			 *    successively activated, so tail call updates can
+			 *    arrive from here while JIT is still finishing its
+			 *    final fixup for non-activated poke entries.
+			 * 3) On program teardown, the program's kallsym entry gets
+			 *    removed out of RCU callback, but we can only untrack
+			 *    from sleepable context, therefore bpf_arch_text_poke()
+			 *    might not see that this is in BPF text section and
+			 *    bails out with -EINVAL. As these are unreachable since
+			 *    RCU grace period already passed, we simply skip them.
+			 * 4) Also programs reaching refcount of zero while patching
+			 *    is in progress is okay since we're protected under
+			 *    poke_mutex and untrack the programs before the JIT
+			 *    buffer is freed. When we're still in the middle of
+			 *    patching and suddenly kallsyms entry of the program
+			 *    gets evicted, we just skip the rest which is fine due
+			 *    to point 3).
+			 * 5) Any other error happening below from bpf_arch_text_poke()
+			 *    is a unexpected bug.
+			 */
+			if (!READ_ONCE(poke->ip_stable))
+				continue;
+			if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
+				continue;
+			if (poke->tail_call.map != map ||
+			    poke->tail_call.key != key)
+				continue;
+
+			ret = bpf_arch_text_poke(poke->ip, type,
+						 old ? (u8 *)old->bpf_func +
+						 poke->adj_off : NULL,
+						 new ? (u8 *)new->bpf_func +
+						 poke->adj_off : NULL);
+			BUG_ON(ret < 0 && ret != -EINVAL);
+		}
+	}
+}
+
+static void prog_array_map_clear_deferred(struct work_struct *work)
+{
+	struct bpf_map *map = container_of(work, struct bpf_array_aux,
+					   work)->map;
+	bpf_fd_array_map_clear(map);
+	bpf_map_put(map);
+}
+
+static void prog_array_map_clear(struct bpf_map *map)
+{
+	struct bpf_array_aux *aux = container_of(map, struct bpf_array,
+						 map)->aux;
+	bpf_map_inc(map);
+	schedule_work(&aux->work);
+}
+
 static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
 {
 	struct bpf_array_aux *aux;
@@ -680,6 +841,10 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
 	if (!aux)
 		return ERR_PTR(-ENOMEM);
 
+	INIT_WORK(&aux->work, prog_array_map_clear_deferred);
+	INIT_LIST_HEAD(&aux->poke_progs);
+	mutex_init(&aux->poke_mutex);
+
 	map = array_map_alloc(attr);
 	if (IS_ERR(map)) {
 		kfree(aux);
@@ -687,14 +852,21 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
 	}
 
 	container_of(map, struct bpf_array, map)->aux = aux;
+	aux->map = map;
+
 	return map;
 }
 
 static void prog_array_map_free(struct bpf_map *map)
 {
+	struct prog_poke_elem *elem, *tmp;
 	struct bpf_array_aux *aux;
 
 	aux = container_of(map, struct bpf_array, map)->aux;
+	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+		list_del_init(&elem->list);
+		kfree(elem);
+	}
 	kfree(aux);
 	fd_array_map_free(map);
 }
@@ -703,13 +875,16 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_alloc_check = fd_array_map_alloc_check,
 	.map_alloc = prog_array_map_alloc,
 	.map_free = prog_array_map_free,
+	.map_poke_track = prog_array_map_poke_track,
+	.map_poke_untrack = prog_array_map_poke_untrack,
+	.map_poke_run = prog_array_map_poke_run,
 	.map_get_next_key = array_map_get_next_key,
 	.map_lookup_elem = fd_array_map_lookup_elem,
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
 	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
-	.map_release_uref = bpf_fd_array_map_clear,
+	.map_release_uref = prog_array_map_clear,
 	.map_seq_show_elem = prog_array_map_seq_show_elem,
 };
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 608b7085e0c9..49e32acad7d8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2050,11 +2050,16 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
 
 static void bpf_free_used_maps(struct bpf_prog_aux *aux)
 {
+	struct bpf_map *map;
 	int i;
 
 	bpf_free_cgroup_storage(aux);
-	for (i = 0; i < aux->used_map_cnt; i++)
-		bpf_map_put(aux->used_maps[i]);
+	for (i = 0; i < aux->used_map_cnt; i++) {
+		map = aux->used_maps[i];
+		if (map->ops->map_poke_untrack)
+			map->ops->map_poke_untrack(map, aux);
+		bpf_map_put(map);
+	}
 	kfree(aux->used_maps);
 }
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b904d56ec686..e3461ec59570 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -25,12 +25,13 @@
 #include <linux/nospec.h>
 #include <uapi/linux/btf.h>
 
-#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
-			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
-			   (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
-			   (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
+			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
+			  (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
-#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
+#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
+			IS_FD_HASH(map))
 
 #define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
 
@@ -877,7 +878,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_percpu_cgroup_storage_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
 		err = bpf_stackmap_copy(map, key, value);
-	} else if (IS_FD_ARRAY(map)) {
+	} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
 		err = bpf_fd_array_map_lookup_elem(map, key, value);
 	} else if (IS_FD_HASH(map)) {
 		err = bpf_fd_htab_map_lookup_elem(map, key, value);
@@ -1004,6 +1005,10 @@ static int map_update_elem(union bpf_attr *attr)
 		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
 		goto out;
+	} else if (IS_FD_PROG_ARRAY(map)) {
+		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+						   attr->flags);
+		goto out;
 	}
 
 	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
@@ -1086,6 +1091,9 @@ static int map_delete_elem(union bpf_attr *attr)
 	if (bpf_map_is_dev_bound(map)) {
 		err = bpf_map_offload_delete_elem(map, key);
 		goto out;
+	} else if (IS_FD_PROG_ARRAY(map)) {
+		err = map->ops->map_delete_elem(map, key);
+		goto out;
 	}
 
 	preempt_disable();
-- 
cgit v1.2.3-59-g8ed1b


From d2e4c1e6c2947269346054ac8937ccfe9e0bcc6b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 22 Nov 2019 21:07:59 +0100
Subject: bpf: Constant map key tracking for prog array pokes

Add tracking of constant keys into tail call maps. The signature of
bpf_tail_call_proto is that arg1 is ctx, arg2 map pointer and arg3
is a index key. The direct call approach for tail calls can be enabled
if the verifier asserted that for all branches leading to the tail call
helper invocation, the map pointer and index key were both constant
and the same.

Tracking of map pointers we already do from prior work via c93552c443eb
("bpf: properly enforce index mask to prevent out-of-bounds speculation")
and 09772d92cd5a ("bpf: avoid retpoline for lookup/update/ delete calls
on maps").

Given the tail call map index key is not on stack but directly in the
register, we can add similar tracking approach and later in fixup_bpf_calls()
add a poke descriptor to the progs poke_tab with the relevant information
for the JITing phase.

We internally reuse insn->imm for the rewritten BPF_JMP | BPF_TAIL_CALL
instruction in order to point into the prog's poke_tab, and keep insn->imm
as 0 as indicator that current indirect tail call emission must be used.
Note that publishing to the tracker must happen at the end of fixup_bpf_calls()
since adding elements to the poke_tab reallocates its memory, so we need
to wait until its in final state.

Future work can generalize and add similar approach to optimize plain
array map lookups. Difference there is that we need to look into the key
value that sits on stack. For clarity in bpf_insn_aux_data, map_state
has been renamed into map_ptr_state, so we get map_{ptr,key}_state as
trackers.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/e8db37f6b2ae60402fa40216c96738ee9b316c32.1574452833.git.daniel@iogearbox.net
---
 include/linux/bpf_verifier.h |   3 +-
 kernel/bpf/verifier.c        | 120 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 113 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index cdd08bf0ec06..26e40de9ef55 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -293,7 +293,7 @@ struct bpf_verifier_state_list {
 struct bpf_insn_aux_data {
 	union {
 		enum bpf_reg_type ptr_type;	/* pointer type for load/store insns */
-		unsigned long map_state;	/* pointer/poison value for maps */
+		unsigned long map_ptr_state;	/* pointer/poison value for maps */
 		s32 call_imm;			/* saved imm field of call insn */
 		u32 alu_limit;			/* limit for add/sub register with pointer */
 		struct {
@@ -301,6 +301,7 @@ struct bpf_insn_aux_data {
 			u32 map_off;		/* offset from value base address */
 		};
 	};
+	u64 map_key_state; /* constant (32 bit) key tracking for maps */
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
 	int sanitize_stack_off; /* stack slot to be cleared */
 	bool seen; /* this insn was processed by the verifier */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fc85714428c7..a0482e1c4a77 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -171,6 +171,9 @@ struct bpf_verifier_stack_elem {
 #define BPF_COMPLEXITY_LIMIT_JMP_SEQ	8192
 #define BPF_COMPLEXITY_LIMIT_STATES	64
 
+#define BPF_MAP_KEY_POISON	(1ULL << 63)
+#define BPF_MAP_KEY_SEEN	(1ULL << 62)
+
 #define BPF_MAP_PTR_UNPRIV	1UL
 #define BPF_MAP_PTR_POISON	((void *)((0xeB9FUL << 1) +	\
 					  POISON_POINTER_DELTA))
@@ -178,12 +181,12 @@ struct bpf_verifier_stack_elem {
 
 static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
 {
-	return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON;
+	return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
 }
 
 static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
 {
-	return aux->map_state & BPF_MAP_PTR_UNPRIV;
+	return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV;
 }
 
 static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
@@ -191,8 +194,31 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
 {
 	BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
 	unpriv |= bpf_map_ptr_unpriv(aux);
-	aux->map_state = (unsigned long)map |
-			 (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+	aux->map_ptr_state = (unsigned long)map |
+			     (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+}
+
+static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
+{
+	return aux->map_key_state & BPF_MAP_KEY_POISON;
+}
+
+static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux)
+{
+	return !(aux->map_key_state & BPF_MAP_KEY_SEEN);
+}
+
+static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux)
+{
+	return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON);
+}
+
+static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state)
+{
+	bool poisoned = bpf_map_key_poisoned(aux);
+
+	aux->map_key_state = state | BPF_MAP_KEY_SEEN |
+			     (poisoned ? BPF_MAP_KEY_POISON : 0ULL);
 }
 
 struct bpf_call_arg_meta {
@@ -4090,15 +4116,49 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 		return -EACCES;
 	}
 
-	if (!BPF_MAP_PTR(aux->map_state))
+	if (!BPF_MAP_PTR(aux->map_ptr_state))
 		bpf_map_ptr_store(aux, meta->map_ptr,
 				  meta->map_ptr->unpriv_array);
-	else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr)
+	else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
 		bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
 				  meta->map_ptr->unpriv_array);
 	return 0;
 }
 
+static int
+record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
+		int func_id, int insn_idx)
+{
+	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+	struct bpf_reg_state *regs = cur_regs(env), *reg;
+	struct bpf_map *map = meta->map_ptr;
+	struct tnum range;
+	u64 val;
+
+	if (func_id != BPF_FUNC_tail_call)
+		return 0;
+	if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
+		verbose(env, "kernel subsystem misconfigured verifier\n");
+		return -EINVAL;
+	}
+
+	range = tnum_range(0, map->max_entries - 1);
+	reg = &regs[BPF_REG_3];
+
+	if (!register_is_const(reg) || !tnum_in(range, reg->var_off)) {
+		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
+		return 0;
+	}
+
+	val = reg->var_off.value;
+	if (bpf_map_key_unseen(aux))
+		bpf_map_key_store(aux, val);
+	else if (!bpf_map_key_poisoned(aux) &&
+		  bpf_map_key_immediate(aux) != val)
+		bpf_map_key_store(aux, BPF_MAP_KEY_POISON);
+	return 0;
+}
+
 static int check_reference_leak(struct bpf_verifier_env *env)
 {
 	struct bpf_func_state *state = cur_func(env);
@@ -4173,6 +4233,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;
 
+	err = record_func_key(env, &meta, func_id, insn_idx);
+	if (err)
+		return err;
+
 	/* Mark slots with STACK_MISC in case of raw mode, stack offset
 	 * is inferred from register state.
 	 */
@@ -9065,6 +9129,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 static int fixup_bpf_calls(struct bpf_verifier_env *env)
 {
 	struct bpf_prog *prog = env->prog;
+	bool expect_blinding = bpf_jit_blinding_enabled(prog);
 	struct bpf_insn *insn = prog->insnsi;
 	const struct bpf_func_proto *fn;
 	const int insn_cnt = prog->len;
@@ -9073,7 +9138,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
 	struct bpf_map *map_ptr;
-	int i, cnt, delta = 0;
+	int i, ret, cnt, delta = 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
@@ -9217,6 +9282,26 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			insn->code = BPF_JMP | BPF_TAIL_CALL;
 
 			aux = &env->insn_aux_data[i + delta];
+			if (prog->jit_requested && !expect_blinding &&
+			    !bpf_map_key_poisoned(aux) &&
+			    !bpf_map_ptr_poisoned(aux) &&
+			    !bpf_map_ptr_unpriv(aux)) {
+				struct bpf_jit_poke_descriptor desc = {
+					.reason = BPF_POKE_REASON_TAIL_CALL,
+					.tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
+					.tail_call.key = bpf_map_key_immediate(aux),
+				};
+
+				ret = bpf_jit_add_poke_descriptor(prog, &desc);
+				if (ret < 0) {
+					verbose(env, "adding tail call poke descriptor failed\n");
+					return ret;
+				}
+
+				insn->imm = ret + 1;
+				continue;
+			}
+
 			if (!bpf_map_ptr_unpriv(aux))
 				continue;
 
@@ -9231,7 +9316,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 				return -EINVAL;
 			}
 
-			map_ptr = BPF_MAP_PTR(aux->map_state);
+			map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
 			insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
 						  map_ptr->max_entries, 2);
 			insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -9265,7 +9350,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			if (bpf_map_ptr_poisoned(aux))
 				goto patch_call_imm;
 
-			map_ptr = BPF_MAP_PTR(aux->map_state);
+			map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
 			ops = map_ptr->ops;
 			if (insn->imm == BPF_FUNC_map_lookup_elem &&
 			    ops->map_gen_lookup) {
@@ -9345,6 +9430,23 @@ patch_call_imm:
 		insn->imm = fn->func - __bpf_call_base;
 	}
 
+	/* Since poke tab is now finalized, publish aux to tracker. */
+	for (i = 0; i < prog->aux->size_poke_tab; i++) {
+		map_ptr = prog->aux->poke_tab[i].tail_call.map;
+		if (!map_ptr->ops->map_poke_track ||
+		    !map_ptr->ops->map_poke_untrack ||
+		    !map_ptr->ops->map_poke_run) {
+			verbose(env, "bpf verifier is misconfigured\n");
+			return -EINVAL;
+		}
+
+		ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
+		if (ret < 0) {
+			verbose(env, "tracking tail call prog failed\n");
+			return ret;
+		}
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From b8cd76ca4ae34731d47cd6a876d912a08efcc240 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 23 Nov 2019 21:37:31 +0100
Subject: bpf: Add bpf_jit_blinding_enabled for !CONFIG_BPF_JIT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a definition of bpf_jit_blinding_enabled() when CONFIG_BPF_JIT is not set
in order to fix a recent build regression:

  [...]
  CC      kernel/bpf/verifier.o
  CC      kernel/bpf/inode.o
kernel/bpf/verifier.c: In function ‘fixup_bpf_calls’:
kernel/bpf/verifier.c:9132:25: error: implicit declaration of function ‘bpf_jit_blinding_enabled’; did you mean ‘bpf_jit_kallsyms_enabled’? [-Werror=implicit-function-declaration]
 9132 |  bool expect_blinding = bpf_jit_blinding_enabled(prog);
      |                         ^~~~~~~~~~~~~~~~~~~~~~~~
      |                         bpf_jit_kallsyms_enabled
  CC      kernel/bpf/helpers.o
  CC      kernel/bpf/hashtab.o
  [...]

Fixes: d2e4c1e6c294 ("bpf: Constant map key tracking for prog array pokes")
Reported-by: Jakub Sitnicki <jakub@cloudflare.com>
Reported-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/40baf8f3507cac4851a310578edfb98ce73b5605.1574541375.git.daniel@iogearbox.net
---
 include/linux/filter.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 796b60d8cc6c..1b1e8b8f88da 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1053,6 +1053,11 @@ static inline bool ebpf_jit_enabled(void)
 	return false;
 }
 
+static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
+{
+	return false;
+}
+
 static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
 {
 	return false;
-- 
cgit v1.2.3-59-g8ed1b


From b553a6ec570044fc1ae300c6fb24f9ce204c5894 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 24 Nov 2019 01:39:42 +0100
Subject: bpf: Simplify __bpf_arch_text_poke poke type handling

Given that we have BPF_MOD_NOP_TO_{CALL,JUMP}, BPF_MOD_{CALL,JUMP}_TO_NOP
and BPF_MOD_{CALL,JUMP}_TO_{CALL,JUMP} poke types and that we also pass in
old_addr as well as new_addr, it's a bit redundant and unnecessarily
complicates __bpf_arch_text_poke() itself since we can derive the same from
the *_addr that were passed in. Hence simplify and use BPF_MOD_{CALL,JUMP}
as types which also allows to clean up call-sites.

In addition to that, __bpf_arch_text_poke() currently verifies that text
matches expected old_insn before we invoke text_poke_bp(). Also add a check
on new_insn and skip rewrite if it already matches. Reason why this is rather
useful is that it avoids making any special casing in prog_array_map_poke_run()
when old and new prog were NULL and has the benefit that also for this case
we perform a check on text whether it really matches our expectations.

Suggested-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/fcb00a2b0b288d6c73de4ef58116a821c8fe8f2f.1574555798.git.daniel@iogearbox.net
---
 arch/x86/net/bpf_jit_comp.c | 85 +++++++++++++--------------------------------
 include/linux/bpf.h         | 10 ++----
 kernel/bpf/arraymap.c       | 12 +------
 kernel/bpf/trampoline.c     |  8 ++---
 4 files changed, 32 insertions(+), 83 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 15615c94804f..b8be18427277 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -269,76 +269,42 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 				void *old_addr, void *new_addr,
 				const bool text_live)
 {
-	int (*emit_patch_fn)(u8 **pprog, void *func, void *ip);
 	const u8 *nop_insn = ideal_nops[NOP_ATOMIC5];
-	u8 old_insn[X86_PATCH_SIZE] = {};
-	u8 new_insn[X86_PATCH_SIZE] = {};
+	u8 old_insn[X86_PATCH_SIZE];
+	u8 new_insn[X86_PATCH_SIZE];
 	u8 *prog;
 	int ret;
 
-	switch (t) {
-	case BPF_MOD_NOP_TO_CALL ... BPF_MOD_CALL_TO_NOP:
-		emit_patch_fn = emit_call;
-		break;
-	case BPF_MOD_NOP_TO_JUMP ... BPF_MOD_JUMP_TO_NOP:
-		emit_patch_fn = emit_jump;
-		break;
-	default:
-		return -ENOTSUPP;
+	memcpy(old_insn, nop_insn, X86_PATCH_SIZE);
+	if (old_addr) {
+		prog = old_insn;
+		ret = t == BPF_MOD_CALL ?
+		      emit_call(&prog, old_addr, ip) :
+		      emit_jump(&prog, old_addr, ip);
+		if (ret)
+			return ret;
 	}
 
-	switch (t) {
-	case BPF_MOD_NOP_TO_CALL:
-	case BPF_MOD_NOP_TO_JUMP:
-		if (!old_addr && new_addr) {
-			memcpy(old_insn, nop_insn, X86_PATCH_SIZE);
-
-			prog = new_insn;
-			ret = emit_patch_fn(&prog, new_addr, ip);
-			if (ret)
-				return ret;
-			break;
-		}
-		return -ENXIO;
-	case BPF_MOD_CALL_TO_CALL:
-	case BPF_MOD_JUMP_TO_JUMP:
-		if (old_addr && new_addr) {
-			prog = old_insn;
-			ret = emit_patch_fn(&prog, old_addr, ip);
-			if (ret)
-				return ret;
-
-			prog = new_insn;
-			ret = emit_patch_fn(&prog, new_addr, ip);
-			if (ret)
-				return ret;
-			break;
-		}
-		return -ENXIO;
-	case BPF_MOD_CALL_TO_NOP:
-	case BPF_MOD_JUMP_TO_NOP:
-		if (old_addr && !new_addr) {
-			memcpy(new_insn, nop_insn, X86_PATCH_SIZE);
-
-			prog = old_insn;
-			ret = emit_patch_fn(&prog, old_addr, ip);
-			if (ret)
-				return ret;
-			break;
-		}
-		return -ENXIO;
-	default:
-		return -ENOTSUPP;
+	memcpy(new_insn, nop_insn, X86_PATCH_SIZE);
+	if (new_addr) {
+		prog = new_insn;
+		ret = t == BPF_MOD_CALL ?
+		      emit_call(&prog, new_addr, ip) :
+		      emit_jump(&prog, new_addr, ip);
+		if (ret)
+			return ret;
 	}
 
 	ret = -EBUSY;
 	mutex_lock(&text_mutex);
 	if (memcmp(ip, old_insn, X86_PATCH_SIZE))
 		goto out;
-	if (text_live)
-		text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
-	else
-		memcpy(ip, new_insn, X86_PATCH_SIZE);
+	if (memcmp(ip, new_insn, X86_PATCH_SIZE)) {
+		if (text_live)
+			text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
+		else
+			memcpy(ip, new_insn, X86_PATCH_SIZE);
+	}
 	ret = 0;
 out:
 	mutex_unlock(&text_mutex);
@@ -465,7 +431,6 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
 
 static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
 {
-	static const enum bpf_text_poke_type type = BPF_MOD_NOP_TO_JUMP;
 	struct bpf_jit_poke_descriptor *poke;
 	struct bpf_array *array;
 	struct bpf_prog *target;
@@ -490,7 +455,7 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
 			 * read-only. Both modifications on the given image
 			 * are under text_mutex to avoid interference.
 			 */
-			ret = __bpf_arch_text_poke(poke->ip, type, NULL,
+			ret = __bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, NULL,
 						   (u8 *)target->bpf_func +
 						   poke->adj_off, false);
 			BUG_ON(ret < 0);
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c2f07fd410c1..35903f148be5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1324,14 +1324,8 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
 #endif /* CONFIG_INET */
 
 enum bpf_text_poke_type {
-	/* All call-related pokes. */
-	BPF_MOD_NOP_TO_CALL,
-	BPF_MOD_CALL_TO_CALL,
-	BPF_MOD_CALL_TO_NOP,
-	/* All jump-related pokes. */
-	BPF_MOD_NOP_TO_JUMP,
-	BPF_MOD_JUMP_TO_JUMP,
-	BPF_MOD_JUMP_TO_NOP,
+	BPF_MOD_CALL,
+	BPF_MOD_JUMP,
 };
 
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 58bdf5fd24cc..f0d19bbb9211 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -746,19 +746,9 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
 				    struct bpf_prog *old,
 				    struct bpf_prog *new)
 {
-	enum bpf_text_poke_type type;
 	struct prog_poke_elem *elem;
 	struct bpf_array_aux *aux;
 
-	if (!old && new)
-		type = BPF_MOD_NOP_TO_JUMP;
-	else if (old && !new)
-		type = BPF_MOD_JUMP_TO_NOP;
-	else if (old && new)
-		type = BPF_MOD_JUMP_TO_JUMP;
-	else
-		return;
-
 	aux = container_of(map, struct bpf_array, map)->aux;
 	WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));
 
@@ -806,7 +796,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
 			    poke->tail_call.key != key)
 				continue;
 
-			ret = bpf_arch_text_poke(poke->ip, type,
+			ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP,
 						 old ? (u8 *)old->bpf_func +
 						 poke->adj_off : NULL,
 						 new ? (u8 *)new->bpf_func +
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 10ae59d65f13..7e89f1f49d77 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -77,7 +77,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
 	int err;
 
 	if (fentry_cnt + fexit_cnt == 0) {
-		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_NOP,
+		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL,
 					 old_image, NULL);
 		tr->selector = 0;
 		goto out;
@@ -105,12 +105,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
 
 	if (tr->selector)
 		/* progs already running at this address */
-		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL_TO_CALL,
+		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL,
 					 old_image, new_image);
 	else
 		/* first time registering */
-		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP_TO_CALL,
-					 NULL, new_image);
+		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_CALL, NULL,
+					 new_image);
 	if (err)
 		goto out;
 	tr->selector++;
-- 
cgit v1.2.3-59-g8ed1b


From bec170e55982c2d3b8e1beccadf16e288fe6fb5a Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 23 Nov 2019 17:28:37 +0100
Subject: net: phy: add helpers phy_(un)lock_mdio_bus

Add helpers to make locking/unlocking the MDIO bus easier.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy-core.c | 28 ++++++++++++++--------------
 include/linux/phy.h        | 10 ++++++++++
 2 files changed, 24 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c
index 5458ed1b87a8..769a076514b0 100644
--- a/drivers/net/phy/phy-core.c
+++ b/drivers/net/phy/phy-core.c
@@ -419,9 +419,9 @@ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum)
 {
 	int ret;
 
-	mutex_lock(&phydev->mdio.bus->mdio_lock);
+	phy_lock_mdio_bus(phydev);
 	ret = __phy_read_mmd(phydev, devad, regnum);
-	mutex_unlock(&phydev->mdio.bus->mdio_lock);
+	phy_unlock_mdio_bus(phydev);
 
 	return ret;
 }
@@ -480,9 +480,9 @@ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val)
 {
 	int ret;
 
-	mutex_lock(&phydev->mdio.bus->mdio_lock);
+	phy_lock_mdio_bus(phydev);
 	ret = __phy_write_mmd(phydev, devad, regnum, val);
-	mutex_unlock(&phydev->mdio.bus->mdio_lock);
+	phy_unlock_mdio_bus(phydev);
 
 	return ret;
 }
@@ -536,9 +536,9 @@ int phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set)
 {
 	int ret;
 
-	mutex_lock(&phydev->mdio.bus->mdio_lock);
+	phy_lock_mdio_bus(phydev);
 	ret = __phy_modify_changed(phydev, regnum, mask, set);
-	mutex_unlock(&phydev->mdio.bus->mdio_lock);
+	phy_unlock_mdio_bus(phydev);
 
 	return ret;
 }
@@ -580,9 +580,9 @@ int phy_modify(struct phy_device *phydev, u32 regnum, u16 mask, u16 set)
 {
 	int ret;
 
-	mutex_lock(&phydev->mdio.bus->mdio_lock);
+	phy_lock_mdio_bus(phydev);
 	ret = __phy_modify(phydev, regnum, mask, set);
-	mutex_unlock(&phydev->mdio.bus->mdio_lock);
+	phy_unlock_mdio_bus(phydev);
 
 	return ret;
 }
@@ -639,9 +639,9 @@ int phy_modify_mmd_changed(struct phy_device *phydev, int devad, u32 regnum,
 {
 	int ret;
 
-	mutex_lock(&phydev->mdio.bus->mdio_lock);
+	phy_lock_mdio_bus(phydev);
 	ret = __phy_modify_mmd_changed(phydev, devad, regnum, mask, set);
-	mutex_unlock(&phydev->mdio.bus->mdio_lock);
+	phy_unlock_mdio_bus(phydev);
 
 	return ret;
 }
@@ -687,9 +687,9 @@ int phy_modify_mmd(struct phy_device *phydev, int devad, u32 regnum,
 {
 	int ret;
 
-	mutex_lock(&phydev->mdio.bus->mdio_lock);
+	phy_lock_mdio_bus(phydev);
 	ret = __phy_modify_mmd(phydev, devad, regnum, mask, set);
-	mutex_unlock(&phydev->mdio.bus->mdio_lock);
+	phy_unlock_mdio_bus(phydev);
 
 	return ret;
 }
@@ -721,7 +721,7 @@ static int __phy_write_page(struct phy_device *phydev, int page)
  */
 int phy_save_page(struct phy_device *phydev)
 {
-	mutex_lock(&phydev->mdio.bus->mdio_lock);
+	phy_lock_mdio_bus(phydev);
 	return __phy_read_page(phydev);
 }
 EXPORT_SYMBOL_GPL(phy_save_page);
@@ -788,7 +788,7 @@ int phy_restore_page(struct phy_device *phydev, int oldpage, int ret)
 		ret = oldpage;
 	}
 
-	mutex_unlock(&phydev->mdio.bus->mdio_lock);
+	phy_unlock_mdio_bus(phydev);
 
 	return ret;
 }
diff --git a/include/linux/phy.h b/include/linux/phy.h
index f5cdfb206097..5032d453ac66 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1076,6 +1076,16 @@ static inline const char *phydev_name(const struct phy_device *phydev)
 	return dev_name(&phydev->mdio.dev);
 }
 
+static inline void phy_lock_mdio_bus(struct phy_device *phydev)
+{
+	mutex_lock(&phydev->mdio.bus->mdio_lock);
+}
+
+static inline void phy_unlock_mdio_bus(struct phy_device *phydev)
+{
+	mutex_unlock(&phydev->mdio.bus->mdio_lock);
+}
+
 void phy_attached_print(struct phy_device *phydev, const char *fmt, ...)
 	__printf(2, 3);
 void phy_attached_info(struct phy_device *phydev);
-- 
cgit v1.2.3-59-g8ed1b